diff --git a/.clang-format b/.clang-format
index 0984d5a7b..737cf9006 100644
--- a/.clang-format
+++ b/.clang-format
@@ -17,7 +17,7 @@ AllowShortEnumsOnASingleLine: true
 AllowShortBlocksOnASingleLine: Never
 AllowShortCaseLabelsOnASingleLine: false
 AllowShortFunctionsOnASingleLine: All
-AllowShortLambdasOnASingleLine: All
+AllowShortLambdasOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: WithoutElse
 AllowShortLoopsOnASingleLine: true
 AlwaysBreakAfterDefinitionReturnType: None
diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
index 4a4d65b25..ca5baf412 100644
--- a/.github/workflows/i386.yml
+++ b/.github/workflows/i386.yml
@@ -5,6 +5,10 @@ on: [push, pull_request]
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   build-32bit:
     name: Build 32-bit
diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 330c037d7..9ef314ca5 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -5,6 +5,10 @@ on: [push, pull_request]
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   test-with-jvm:
     name: Test JVM on OS ${{ matrix.os }}
@@ -15,31 +19,36 @@ jobs:
         os: [windows-latest, ubuntu-latest, macos-11]
 
     steps:
-    - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: 'true'
 
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7  # v1.8.1
       with:
-        python-version: '3.8'
-        architecture: 'x64'
-
-    - uses: actions/setup-java@d202f5dbf7256730fb690ec59f6381650114feb2 # v3.6.0
-      with:
-        java-version: 1.8
-
-    - name: Install Python packages
-      run: |
-        python -m pip install wheel setuptools
-        python -m pip install awscli
+        micromamba-version: '1.5.6-0'
+        environment-name: jvm_tests
+        create-args: >-
+          python=3.10
+          awscli
+        cache-downloads: true
+        cache-environment: true
+        init-shell: bash powershell
 
     - name: Cache Maven packages
-      uses: actions/cache@6998d139ddd3e68c71e9e398d8e40b71a2f39812 # v3.2.5
+      uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2  # v4.0.0
       with:
         path: ~/.m2
         key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
         restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
 
+    - name: Build xgboost4j.dll
+      run: |
+        mkdir build
+        cd build
+        cmake .. -G"Visual Studio 17 2022" -A x64 -DJVM_BINDINGS=ON
+        cmake --build . --config Release
+      if: matrix.os == 'windows-latest'
+
     - name: Test XGBoost4J (Core)
       run: |
         cd jvm-packages
@@ -47,7 +56,8 @@ jobs:
 
     - name: Extract branch name
       shell: bash
-      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      run: |
+        echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
       id: extract_branch
       if: |
         (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
@@ -58,7 +68,7 @@ jobs:
         cd lib/
         Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
         dir
-        python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
+        python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
       if: |
         (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
         matrix.os == 'windows-latest'
@@ -67,11 +77,12 @@ jobs:
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
 
     - name: Publish artifact libxgboost4j.dylib to S3
+      shell: bash -l {0}
       run: |
         cd lib/
         mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
         ls
-        python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read
+        python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
       if: |
         (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
         matrix.os == 'macos-11'
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 20e91a5d9..b064b4843 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -9,6 +9,10 @@ on: [push, pull_request]
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   gtest-cpu:
@@ -174,7 +178,7 @@ jobs:
     - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
       with:
         submodules: 'true'
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 0fca76673..0a182677f 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -9,6 +9,10 @@ defaults:
   run:
     shell: bash -l {0}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   python-mypy-lint:
     runs-on: ubuntu-latest
@@ -310,7 +314,7 @@ jobs:
           submodules: 'true'
 
       - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
         with:
           python-version: 3.8
 
diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
index f46b77295..3a5c1db99 100644
--- a/.github/workflows/python_wheels.yml
+++ b/.github/workflows/python_wheels.yml
@@ -5,6 +5,10 @@ on: [push, pull_request]
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   python-wheels:
     name: Build wheel for ${{ matrix.platform_id }}
@@ -21,7 +25,7 @@ jobs:
       with:
         submodules: 'true'
     - name: Setup Python
-      uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
     - name: Build wheels
diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml
index a014c9138..eb7179e81 100644
--- a/.github/workflows/r_nold.yml
+++ b/.github/workflows/r_nold.yml
@@ -10,6 +10,10 @@ on:
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   test-R-noLD:
     if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association)
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index d004ab15c..045dac575 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -8,6 +8,10 @@ env:
 permissions:
   contents: read # to fetch code (actions/checkout)
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   lintr:
     runs-on: ${{ matrix.config.os }}
@@ -46,7 +50,7 @@ jobs:
         MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/
         Rscript tests/ci_build/lint_r.R $(pwd)
 
-  test-R-on-Windows:
+  test-Rpkg:
     runs-on: ${{ matrix.config.os }}
     name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
     strategy:
@@ -54,11 +58,17 @@ jobs:
       matrix:
         config:
           - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
+          - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
 
     steps:
+    - name: Install system dependencies
+      run: |
+        sudo apt update
+        sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev
+      if: matrix.config.os == 'ubuntu-latest'
     - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
       with:
         submodules: 'true'
@@ -74,7 +84,7 @@ jobs:
         key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
         restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
 
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'
@@ -89,6 +99,12 @@ jobs:
     - name: Test R
       run: |
         python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check
+      if: matrix.config.compiler != 'none'
+
+    - name: Test R
+      run: |
+        python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check
+      if: matrix.config.compiler == 'none'
 
   test-R-on-Debian:
     name: Test R package on Debian
diff --git a/R-package/CMakeLists.txt b/R-package/CMakeLists.txt
index d3a69abc2..37c5dbf4c 100644
--- a/R-package/CMakeLists.txt
+++ b/R-package/CMakeLists.txt
@@ -26,7 +26,6 @@ endif()
 target_compile_definitions(
   xgboost-r PUBLIC
   -DXGBOOST_STRICT_R_MODE=1
-  -DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1
   -DDMLC_LOG_BEFORE_THROW=0
   -DDMLC_DISABLE_STDIN=1
   -DDMLC_LOG_CUSTOMIZE=1
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 66e2b5692..b4072aff0 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -56,7 +56,8 @@ Suggests:
     testthat,
     igraph (>= 1.0.1),
     float,
-    titanic
+    titanic,
+    RhpcBLASctl
 Depends:
     R (>= 4.3.0)
 Imports:
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 580d1f873..c9e085e77 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -20,15 +20,9 @@ export("xgb.attr<-")
 export("xgb.attributes<-")
 export("xgb.config<-")
 export("xgb.parameters<-")
-export(cb.cv.predict)
-export(cb.early.stop)
-export(cb.evaluation.log)
-export(cb.gblinear.history)
-export(cb.print.evaluation)
-export(cb.reset.parameters)
-export(cb.save.model)
 export(getinfo)
 export(setinfo)
+export(xgb.Callback)
 export(xgb.DMatrix)
 export(xgb.DMatrix.hasinfo)
 export(xgb.DMatrix.save)
@@ -39,6 +33,13 @@ export(xgb.QuantileDMatrix)
 export(xgb.QuantileDMatrix.from_iterator)
 export(xgb.attr)
 export(xgb.attributes)
+export(xgb.cb.cv.predict)
+export(xgb.cb.early.stop)
+export(xgb.cb.evaluation.log)
+export(xgb.cb.gblinear.history)
+export(xgb.cb.print.evaluation)
+export(xgb.cb.reset.parameters)
+export(xgb.cb.save.model)
 export(xgb.config)
 export(xgb.copy.Booster)
 export(xgb.create.features)
@@ -72,14 +73,10 @@ export(xgb.slice.DMatrix)
 export(xgb.train)
 export(xgboost)
 import(methods)
+importClassesFrom(Matrix,CsparseMatrix)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgRMatrix)
-importClassesFrom(Matrix,dgeMatrix)
-importFrom(Matrix,colSums)
 importFrom(Matrix,sparse.model.matrix)
-importFrom(Matrix,sparseMatrix)
-importFrom(Matrix,sparseVector)
-importFrom(Matrix,t)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
 importFrom(data.table,data.table)
@@ -101,6 +98,7 @@ importFrom(methods,new)
 importFrom(stats,coef)
 importFrom(stats,median)
 importFrom(stats,predict)
+importFrom(stats,sd)
 importFrom(stats,variable.names)
 importFrom(utils,head)
 importFrom(utils,object.size)
diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 02e0a7cd4..39734ab09 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -1,769 +1,392 @@
-#' Callback closures for booster training.
-#'
-#' These are used to perform various service tasks either during boosting iterations or at the end.
-#' This approach helps to modularize many of such tasks without bloating the main training methods,
-#' and it offers .
-#'
-#' @details
-#' By default, a callback function is run after each boosting iteration.
-#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
-#'
-#' When a callback function has \code{finalize} parameter, its finalizer part will also be run after
-#' the boosting is completed.
-#'
-#' WARNING: side-effects!!! Be aware that these callback functions access and modify things in
-#' the environment from which they are called from, which is a fairly uncommon thing to do in R.
-#'
-#' To write a custom callback closure, make sure you first understand the main concepts about R environments.
-#' Check either R documentation on \code{\link[base]{environment}} or the
-#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R"
-#' book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks -
-#' choose ones that do something similar to what you want to achieve. Also, you would need to get familiar
-#' with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments.
-#'
-#' @seealso
-#' \code{\link{cb.print.evaluation}},
-#' \code{\link{cb.evaluation.log}},
-#' \code{\link{cb.reset.parameters}},
-#' \code{\link{cb.early.stop}},
-#' \code{\link{cb.save.model}},
-#' \code{\link{cb.cv.predict}},
-#' \code{\link{xgb.train}},
-#' \code{\link{xgb.cv}}
-#'
-#' @name callbacks
-NULL
+.reserved_cb_names <- c("names", "class", "call", "params", "niter", "nfeatures", "folds")
 
-#
-# Callbacks -------------------------------------------------------------------
-#
-
-#' Callback closure for printing the result of evaluation
+#' @title XGBoost Callback Constructor
+#' @description Constructor for defining the structure of callback functions that can be executed
+#' at different stages of model training (before / after training, before / after each boosting
+#' iteration).
+#' @param cb_name Name for the callback.
 #'
-#' @param period  results would be printed every number of periods
-#' @param showsd  whether standard deviations should be printed (when available)
+#' If the callback produces some non-NULL result (from executing the function passed under
+#' `f_after_training`), that result will be added as an R attribute to the resulting booster
+#' (or as a named element in the result of CV), with the attribute name specified here.
 #'
-#' @details
-#' The callback function prints the result of evaluation at every \code{period} iterations.
-#' The initial and the last iteration's evaluations are always printed.
+#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.
+#' @param env An environment object that will be passed to the different functions in the callback.
+#' Note that this environment will not be shared with other callbacks.
+#' @param f_before_training A function that will be executed before the training has started.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available),
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
+#' If passing `NULL` for this or for the other function inputs, then no function will be executed.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' If passing a function, it will be called with parameters supplied as non-named arguments
+#' matching the function signatures that are shown in the default value for each function argument.
+#' @param f_before_iter A function that will be executed before each boosting round.
 #'
-#' @export
-cb.print.evaluation <- function(period = 1, showsd = TRUE) {
-
-  callback <- function(env = parent.frame()) {
-    if (length(env$bst_evaluation) == 0 ||
-        period == 0 ||
-        NVL(env$rank, 0) != 0)
-      return()
-
-    i <- env$iteration
-    if ((i - 1) %% period == 0 ||
-        i == env$begin_iteration ||
-        i == env$end_iteration) {
-      stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
-      cat(msg, '\n')
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.print.evaluation'
-  callback
-}
-
-
-#' Callback closure for logging the evaluation history
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped before the current iteration happens.
 #'
-#' @details
-#' This callback function appends the current iteration evaluation results \code{bst_evaluation}
-#' available in the calling parent frame to the \code{evaluation_log} list in a calling frame.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_iter A function that will be executed after each boosting round.
 #'
-#' The finalizer callback (called with \code{finalize = TURE} in the end) converts
-#' the \code{evaluation_log} list into a final data.table.
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped at that round.
 #'
-#' The iteration evaluation result \code{bst_evaluation} must be a named numeric vector.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_training A function that will be executed after training is finished.
 #'
-#' Note: in the column names of the final data.table, the dash '-' character is replaced with
-#' the underscore '_' in order to make the column names more like regular R identifiers.
+#' This function can optionally output something non-NULL, which will become part of the R
+#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to \link{xgb.train})
+#' under the name supplied for parameter `cb_name` in the case of \link{xgb.train}; or a part
+#' of the named elements in the result of \link{xgb.cv}.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details Arguments that will be passed to the supplied functions are as follows:\itemize{
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{evaluation_log},
-#' \code{bst_evaluation},
-#' \code{iteration}.
+#' \item env The same environment that is passed under argument `env`.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' It may be modified by the functions in order to e.g. keep track of what happens
+#' across iterations or similar.
 #'
-#' @export
-cb.evaluation.log <- function() {
-
-  mnames <- NULL
-
-  init <- function(env) {
-    if (!is.list(env$evaluation_log))
-      stop("'evaluation_log' has to be a list")
-    mnames <<- names(env$bst_evaluation)
-    if (is.null(mnames) || any(mnames == ""))
-      stop("bst_evaluation must have non-empty names")
-
-    mnames <<- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE)
-    if (!is.null(env$bst_evaluation_err))
-      mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std'))
-  }
-
-  finalizer <- function(env) {
-    env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log)))
-    setnames(env$evaluation_log, c('iter', mnames))
-
-    if (!is.null(env$bst_evaluation_err)) {
-      # rearrange col order from _mean,_mean,...,_std,_std,...
-      # to be _mean,_std,_mean,_std,...
-      len <- length(mnames)
-      means <- mnames[seq_len(len / 2)]
-      stds <- mnames[(len / 2 + 1):len]
-      cnames <- numeric(len)
-      cnames[c(TRUE, FALSE)] <- means
-      cnames[c(FALSE, TRUE)] <- stds
-      env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with = FALSE]
-    }
-  }
-
-  callback <- function(env = parent.frame(), finalize = FALSE) {
-    if (is.null(mnames))
-      init(env)
-
-    if (finalize)
-      return(finalizer(env))
-
-    ev <- env$bst_evaluation
-    if (!is.null(env$bst_evaluation_err))
-      ev <- c(ev, env$bst_evaluation_err)
-    env$evaluation_log <- c(env$evaluation_log,
-                            list(c(iter = env$iteration, ev)))
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.evaluation.log'
-  callback
-}
-
-#' Callback closure for resetting the booster's parameters at each iteration.
+#' This environment is only used by the functions supplied to the callback, and will
+#' not be kept after the model fitting function terminates (see parameter `f_after_training`).
 #'
-#' @param new_params a list where each element corresponds to a parameter that needs to be reset.
-#'        Each element's value must be either a vector of values of length \code{nrounds}
-#'        to be set at each iteration,
-#'        or a function of two parameters \code{learning_rates(iteration, nrounds)}
-#'        which returns a new parameter value by using the current iteration number
-#'        and the total number of boosting rounds.
+#' \item model The booster object when using \link{xgb.train}, or the folds when using
+#' \link{xgb.cv}.
 #'
-#' @details
-#' This is a "pre-iteration" callback function used to reset booster's parameters
-#' at the beginning of each iteration.
-#'
-#' Note that when training is resumed from some previous model, and a function is used to
-#' reset a parameter value, the \code{nrounds} argument in this function would be the
-#' the number of boosting rounds in the current training.
-#'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst} or \code{bst_folds},
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
-#'
-#' @seealso
-#' \code{\link{callbacks}}
-#'
-#' @export
-cb.reset.parameters <- function(new_params) {
-
-  if (typeof(new_params) != "list")
-    stop("'new_params' must be a list")
-  pnames <- gsub(".", "_", names(new_params), fixed = TRUE)
-  nrounds <- NULL
-
-  # run some checks in the beginning
-  init <- function(env) {
-    nrounds <<- env$end_iteration - env$begin_iteration + 1
-
-    if (is.null(env$bst) && is.null(env$bst_folds))
-      stop("Parent frame has neither 'bst' nor 'bst_folds'")
-
-    # Some parameters are not allowed to be changed,
-    # since changing them would simply wreck some chaos
-    not_allowed <- pnames %in%
-      c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
-    if (any(not_allowed))
-      stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.")
-
-    for (n in pnames) {
-      p <- new_params[[n]]
-      if (is.function(p)) {
-        if (length(formals(p)) != 2)
-          stop("Parameter '", n, "' is a function but not of two arguments")
-      } else if (is.numeric(p) || is.character(p)) {
-        if (length(p) != nrounds)
-          stop("Length of '", n, "' has to be equal to 'nrounds'")
-      } else {
-        stop("Parameter '", n, "' is not a function or a vector")
-      }
-    }
-  }
-
-  callback <- function(env = parent.frame()) {
-    if (is.null(nrounds))
-      init(env)
-
-    i <- env$iteration
-    pars <- lapply(new_params, function(p) {
-      if (is.function(p))
-        return(p(i, nrounds))
-      p[i]
-    })
-
-    if (!is.null(env$bst)) {
-      xgb.parameters(env$bst) <- pars
-    } else {
-      for (fd in env$bst_folds)
-        xgb.parameters(fd$bst) <- pars
-    }
-  }
-  attr(callback, 'is_pre_iteration') <- TRUE
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.reset.parameters'
-  callback
-}
-
-
-#' Callback closure to activate the early stopping.
-#'
-#' @param stopping_rounds The number of rounds with no improvement in
-#'        the evaluation metric in order to stop the training.
-#' @param maximize whether to maximize the evaluation metric
-#' @param metric_name the name of an evaluation column to use as a criteria for early
-#'        stopping. If not set, the last column would be used.
-#'        Let's say the test data in \code{watchlist} was labelled as \code{dtest},
-#'        and one wants to use the AUC in test data for early stopping regardless of where
-#'        it is in the \code{watchlist}, then one of the following would need to be set:
-#'        \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
-#'        All dash '-' characters in metric names are considered equivalent to '_'.
-#' @param verbose whether to print the early stopping information.
-#'
-#' @details
-#' This callback function determines the condition for early stopping
-#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
-#'
-#' The following additional fields are assigned to the model's R object:
-#' \itemize{
-#' \item \code{best_score} the evaluation score at the best iteration
-#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-#' }
-#' The Same values are also stored as xgb-attributes:
-#' \itemize{
-#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
-#' \item \code{best_msg} message string is also stored.
+#' For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
+#' \item `dtrain`: The training data for the fold (as an `xgb.DMatrix` object).
+#' \item `bst`: The `xgb.Booster` object for the fold.
+#' \item `evals`: A list containing two DMatrices, with names `train` and `test`
+#' (`test` is the held-out data for the fold).
+#' \item `index`: The indices of the hold-out data for that fold (base-1 indexing),
+#' from which the `test` entry in `evals` was obtained.
 #' }
 #'
-#' At least one data element is required in the evaluation watchlist for early stopping to work.
+#' This object should \bold{not} be in-place modified in ways that conflict with the
+#' training (e.g. resetting the parameters for a training update in a way that resets
+#' the number of rounds to zero in order to overwrite rounds).
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{stop_condition},
-#' \code{bst_evaluation},
-#' \code{rank},
-#' \code{bst} (or \code{bst_folds} and \code{basket}),
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration},
+#' Note that any R attributes that are assigned to the booster during the callback functions,
+#' will not be kept thereafter as the booster object variable is not re-assigned during
+#' training. It is however possible to set C-level attributes of the booster through
+#' \link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest
+#' of the iterations and after the training is done.
 #'
-#' @seealso
-#' \code{\link{callbacks}},
-#' \code{\link{xgb.attr}}
+#' For keeping variables across iterations, it's recommended to use `env` instead.
+#' \item data The data to which the model is being fit, as an `xgb.DMatrix` object.
 #'
-#' @export
-cb.early.stop <- function(stopping_rounds, maximize = FALSE,
-                          metric_name = NULL, verbose = TRUE) {
-  # state variables
-  best_iteration <- -1
-  best_score <- Inf
-  best_msg <- NULL
-  metric_idx <- 1
-
-  init <- function(env) {
-    if (length(env$bst_evaluation) == 0)
-      stop("For early stopping, watchlist must have at least one element")
-
-    eval_names <- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE)
-    if (!is.null(metric_name)) {
-      metric_idx <<- which(gsub('-', '_', metric_name, fixed = TRUE) == eval_names)
-      if (length(metric_idx) == 0)
-        stop("'metric_name' for early stopping is not one of the following:\n",
-             paste(eval_names, collapse = ' '), '\n')
-    }
-    if (is.null(metric_name) &&
-        length(env$bst_evaluation) > 1) {
-      metric_idx <<- length(eval_names)
-      if (verbose)
-        cat('Multiple eval metrics are present. Will use ',
-            eval_names[metric_idx], ' for early stopping.\n', sep = '')
-    }
-
-    metric_name <<- eval_names[metric_idx]
-
-    # maximize is usually NULL when not set in xgb.train and built-in metrics
-    if (is.null(maximize))
-      maximize <<- grepl('(_auc|_map|_ndcg|_pre)', metric_name)
-
-    if (verbose && NVL(env$rank, 0) == 0)
-      cat("Will train until ", metric_name, " hasn't improved in ",
-          stopping_rounds, " rounds.\n\n", sep = '')
-
-    best_iteration <<- 1
-    if (maximize) best_score <<- -Inf
-
-    env$stop_condition <- FALSE
-
-    if (!is.null(env$bst)) {
-      if (!inherits(env$bst, 'xgb.Booster'))
-        stop("'bst' in the parent frame must be an 'xgb.Booster'")
-      if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
-        best_score <<- as.numeric(best_score)
-        best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
-        best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
-      } else {
-        xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
-                                        best_score = best_score)
-      }
-    } else if (is.null(env$bst_folds) || is.null(env$basket)) {
-      stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
-    }
-  }
-
-  finalizer <- function(env) {
-    if (!is.null(env$bst)) {
-      attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
-      if (best_score != attr_best_score) {
-        # If the difference is too big, throw an error
-        if (abs(best_score - attr_best_score) >= 1e-14) {
-          stop("Inconsistent 'best_score' values between the closure state: ", best_score,
-               " and the xgb.attr: ", attr_best_score)
-        }
-        # If the difference is due to floating-point truncation, update best_score
-        best_score <- attr_best_score
-      }
-      xgb.attr(env$bst, "best_iteration") <- best_iteration - 1
-      xgb.attr(env$bst, "best_score") <- best_score
-    } else {
-      env$basket$best_iteration <- best_iteration
-    }
-  }
-
-  callback <- function(env = parent.frame(), finalize = FALSE) {
-    if (best_iteration < 0)
-      init(env)
-
-    if (finalize)
-      return(finalizer(env))
-
-    i <- env$iteration
-    score <- env$bst_evaluation[metric_idx]
-
-    if ((maximize && score > best_score) ||
-        (!maximize && score < best_score)) {
-
-      best_msg <<- .format_eval_string(
-        i, env$bst_evaluation, env$bst_evaluation_err
-      )
-      best_score <<- score
-      best_iteration <<- i
-      # save the property to attributes, so they will occur in checkpoint
-      if (!is.null(env$bst)) {
-        xgb.attributes(env$bst) <- list(
-          best_iteration = best_iteration - 1, # convert to 0-based index
-          best_score = best_score,
-          best_msg = best_msg
-        )
-      }
-    } else if (i - best_iteration >= stopping_rounds) {
-      env$stop_condition <- TRUE
-      env$end_iteration <- i
-      if (verbose && NVL(env$rank, 0) == 0)
-        cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '')
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.early.stop'
-  callback
-}
-
-
-#' Callback closure for saving a model file.
+#' Note that, for \link{xgb.cv}, this will be the full data, while data for the specific
+#' folds can be found in the `model` object.
 #'
-#' @param save_period save the model to disk after every
-#'        \code{save_period} iterations; 0 means save the model at the end.
-#' @param save_name the name or path for the saved model file.
+#' \item evals The evaluation data, as passed under argument `evals` to
+#' \link{xgb.train}.
 #'
-#'        Note that the format of the model being saved is determined by the file
-#'        extension specified here (see \link{xgb.save} for details about how it works).
+#' For \link{xgb.cv}, this will always be `NULL`.
 #'
-#'        It can contain a \code{\link[base]{sprintf}} formatting specifier
-#'        to include the integer iteration number in the file name.
-#'        E.g., with \code{save_name} = 'xgboost_%04d.ubj',
-#'        the file saved at iteration 50 would be named "xgboost_0050.ubj".
-#' @seealso \link{xgb.save}
-#' @details
-#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
+#' \item begin_iteration Index of the first boosting iteration that will be executed
+#' (base-1 indexing).
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst},
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
+#' This will typically be '1', but when using training continuation, depending on the
+#' parameters for updates, boosting rounds will be continued from where the previous
+#' model ended, in which case this will be larger than 1.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' \item end_iteration Index of the last boosting iteration that will be executed
+#' (base-1 indexing, inclusive of this end).
 #'
-#' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
-
-  if (save_period < 0)
-    stop("'save_period' cannot be negative")
-
-  callback <- function(env = parent.frame()) {
-    if (is.null(env$bst))
-      stop("'save_model' callback requires the 'bst' booster object in its calling frame")
-
-    if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
-        (save_period == 0 && env$iteration == env$end_iteration)) {
-      # Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
-      suppressWarnings({
-        save_name <- sprintf(save_name, env$iteration)
-      })
-      xgb.save(env$bst, save_name)
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.save.model'
-  callback
-}
-
-
-#' Callback closure for returning cross-validation based predictions.
+#' It should match with argument `nrounds` passed to \link{xgb.train} or \link{xgb.cv}.
 #'
-#' @param save_models a flag for whether to save the folds' models.
+#' Note that boosting might be interrupted before reaching this last iteration, for
+#' example by using the early stopping callback \link{xgb.cb.early.stop}.
 #'
-#' @details
-#' This callback function saves predictions for all of the test folds,
-#' and also allows to save the folds' models.
+#' \item iteration Index of the iteration number that is being executed (first iteration
+#' will be the same as parameter `begin_iteration`, then next one will add +1, and so on).
 #'
-#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
-#' thus it must be run after the early stopping callback if the early stopping is used.
+#' \item iter_feval Evaluation metrics for `evals` that were supplied, either
+#' determined by the objective, or by parameter `feval`.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst_folds},
-#' \code{basket},
-#' \code{data},
-#' \code{end_iteration},
-#' \code{params},
+#' For \link{xgb.train}, this will be a named vector with one entry per element in
+#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for
+#' example, if `evals` contains an entry named "tr" and the metric is "rmse",
+#' this will be a one-element vector with name "tr-rmse".
 #'
-#' @return
-#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
-#' depending on the number of prediction outputs per data row. The order of predictions corresponds
-#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is
-#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
-#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
-#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
-#' When some of the indices in the training dataset are not included into user-provided \code{folds},
-#' their prediction value would be \code{NA}.
+#' For \link{xgb.cv}, this will be a 2d matrix with dimensions `[length(evals), nfolds]`,
+#' where the row names will follow the same naming logic as the one-dimensional vector
+#' that is passed in \link{xgb.train}.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize
+#' this table by calculating the row-wise means and standard deviations.
 #'
-#' @export
-cb.cv.predict <- function(save_models = FALSE) {
-
-  finalizer <- function(env) {
-    if (is.null(env$basket) || is.null(env$bst_folds))
-      stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
-
-    N <- nrow(env$data)
-    pred <- NULL
-
-    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration))
-    if (NVL(env$params[['booster']], '') == 'gblinear') {
-      iterationrange <- "all"
-    }
-    for (fd in env$bst_folds) {
-      pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
-      if (is.null(pred)) {
-        if (NCOL(pr) > 1L) {
-          pred <- matrix(NA_real_, N, ncol(pr))
-        } else {
-          pred <- matrix(NA_real_, N)
-        }
-      }
-      if (is.matrix(pred)) {
-        pred[fd$index, ] <- pr
-      } else {
-        pred[fd$index] <- pr
-      }
-    }
-    env$basket$pred <- pred
-    if (save_models) {
-      env$basket$models <- lapply(env$bst_folds, function(fd) {
-        return(fd$bst)
-      })
-    }
-  }
-
-  callback <- function(env = parent.frame(), finalize = FALSE) {
-    if (finalize)
-      return(finalizer(env))
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.cv.predict'
-  callback
-}
-
-
-#' Callback closure for collecting the model coefficients history of a gblinear booster
-#' during its training.
+#' \item final_feval The evaluation results after the last boosting round is executed
+#' (same format as `iter_feval`, and will be the exact same input as passed under
+#' `iter_feval` to the last round that is executed during model fitting).
 #'
-#' @param sparse when set to FALSE/TRUE, a dense/sparse matrix is used to store the result.
-#'       Sparse format is useful when one expects only a subset of coefficients to be non-zero,
-#'       when using the "thrifty" feature selector with fairly small number of top features
-#'       selected per iteration.
+#' \item prev_cb_res Result from a previous run of a callback sharing the same name
+#' (as given by parameter `cb_name`) when conducting training continuation, if there
+#' was any in the booster R attributes.
 #'
-#' @details
-#' To keep things fast and simple, gblinear booster does not internally store the history of linear
-#' model coefficients at each boosting iteration. This callback provides a workaround for storing
-#' the coefficients' path, by extracting them after each training iteration.
+#' Sometimes, one might want to append the new results to the previous one, and this will
+#' be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
+#' which will append the new rows to the previous table.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst} (or \code{bst_folds}).
+#' If no such previous callback result is available (which it never will when fitting
+#' a model from start instead of updating an existing model), this will be `NULL`.
 #'
-#' @return
-#' Results are stored in the \code{coefs} element of the closure.
-#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy
-#' way to access it.
-#' With \code{xgb.train}, it is either a dense of a sparse matrix.
-#' While with \code{xgb.cv}, it is a list (an element per each fold) of such
-#' matrices.
+#' For \link{xgb.cv}, which doesn't support training continuation, this will always be `NULL`.
+#' }
 #'
-#' @seealso
-#' \code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+#' The following names (`cb_name` values) are reserved for internal callbacks:\itemize{
+#' \item print_evaluation
+#' \item evaluation_log
+#' \item reset_parameters
+#' \item early_stop
+#' \item save_model
+#' \item cv_predict
+#' \item gblinear_history
+#' }
 #'
+#' The following names are reserved for other non-callback attributes:\itemize{
+#' \item names
+#' \item class
+#' \item call
+#' \item params
+#' \item niter
+#' \item nfeatures
+#' \item folds
+#' }
+#'
+#' When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback
+#' will always be executed before the others, as it sets some booster C-level attributes
+#' that other callbacks might also use. Otherwise, the order of execution will match with
+#' the order in which the callbacks are passed to the model fitting function.
+#' @seealso Built-in callbacks:\itemize{
+#' \item \link{xgb.cb.print.evaluation}
+#' \item \link{xgb.cb.evaluation.log}
+#' \item \link{xgb.cb.reset.parameters}
+#' \item \link{xgb.cb.early.stop}
+#' \item \link{xgb.cb.save.model}
+#' \item \link{xgb.cb.cv.predict}
+#' \item \link{xgb.cb.gblinear.history}
+#' }
 #' @examples
-#' #### Binary classification:
+#' # Example constructing a custom callback that calculates
+#' # squared error on the training data (no separate test set),
+#' # and outputs the per-iteration results.
+#' ssq_callback <- xgb.Callback(
+#'   cb_name = "ssq",
+#'   f_before_training = function(env, model, data, evals,
+#'                                begin_iteration, end_iteration) {
+#'     # A vector to keep track of a number at each iteration
+#'     env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
+#'   },
+#'   f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+#'     # This calculates the sum of squared errors on the training data.
+#'     # Note that this can be better done by passing an 'evals' entry,
+#'     # but this demonstrates a way in which callbacks can be structured.
+#'     pred <- predict(model, data)
+#'     err <- pred - getinfo(data, "label")
+#'     sq_err <- sum(err^2)
+#'     env$logs[iteration] <- sq_err
+#'     cat(
+#'       sprintf(
+#'         "Squared error at iteration %d: %.2f\n",
+#'         iteration, sq_err
+#'       )
+#'     )
 #'
-#' ## Keep the number of threads to 1 for examples
-#' nthread <- 1
-#' data.table::setDTthreads(nthread)
+#'     # A return value of 'TRUE' here would signal to finalize the training
+#'     return(FALSE)
+#'   },
+#'   f_after_training = function(env, model, data, evals, iteration,
+#'                               final_feval, prev_cb_res) {
+#'     return(env$logs)
+#'   }
+#' )
 #'
-#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
-#' # without considering the 2nd order interactions:
-#' x <- model.matrix(Species ~ .^2, iris)[,-1]
-#' colnames(x)
-#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
-#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
-#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
-#' # For 'shotgun', which is a default linear updater, using high eta values may result in
-#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
-#' # rate does not break the convergence, but allows us to illustrate the typical pattern of
-#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
-#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
-#'                  callbacks = list(cb.gblinear.history()))
-#' # Extract the coefficients' path and plot them vs boosting iteration number:
-#' coef_path <- xgb.gblinear.history(bst)
-#' matplot(coef_path, type = 'l')
-#'
-#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates.
-#' # Will try the classical componentwise boosting which selects a single best feature per round:
-#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
-#'                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
-#'                  callbacks = list(cb.gblinear.history()))
-#' matplot(xgb.gblinear.history(bst), type = 'l')
-#' #  Componentwise boosting is known to have similar effect to Lasso regularization.
-#' # Try experimenting with various values of top_k, eta, nrounds,
-#' # as well as different feature_selectors.
-#'
-#' # For xgb.cv:
-#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
-#'               callbacks = list(cb.gblinear.history()))
-#' # coefficients in the CV fold #3
-#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
-#'
-#'
-#' #### Multiclass classification:
-#' #
-#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
-#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
-#' # For the default linear updater 'shotgun' it sometimes is helpful
-#' # to use smaller eta to reduce instability
-#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
-#'                  callbacks = list(cb.gblinear.history()))
-#' # Will plot the coefficient paths separately for each class:
-#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
-#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l')
-#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l')
-#'
-#' # CV:
-#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
-#'               callbacks = list(cb.gblinear.history(FALSE)))
-#' # 1st fold of 1st class
-#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
+#' data(mtcars)
+#' y <- mtcars$mpg
+#' x <- as.matrix(mtcars[, -1])
+#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
+#' model <- xgb.train(
+#'   data = dm,
+#'   params = list(objective = "reg:squarederror", nthread = 1),
+#'   nrounds = 5,
+#'   callbacks = list(ssq_callback),
+#'   keep_extra_attributes = TRUE
+#' )
 #'
+#' # Result from 'f_after_training' will be available as an attribute
+#' attributes(model)$ssq
 #' @export
-cb.gblinear.history <- function(sparse = FALSE) {
-  coefs <- NULL
+xgb.Callback <- function(
+  cb_name = "custom_callback",
+  env = new.env(),
+  f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) NULL,
+  f_before_iter = function(env, model, data, evals, iteration) NULL,
+  f_after_iter = function(env, model, data, evals, iteration, iter_feval) NULL,
+  f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) NULL
+) {
+  stopifnot(is.null(f_before_training) || is.function(f_before_training))
+  stopifnot(is.null(f_before_iter) || is.function(f_before_iter))
+  stopifnot(is.null(f_after_iter) || is.function(f_after_iter))
+  stopifnot(is.null(f_after_training) || is.function(f_after_training))
+  stopifnot(is.character(cb_name) && length(cb_name) == 1)
 
-  init <- function(env) {
-    # xgb.train(): bst will be present
-    # xgb.cv(): bst_folds will be present
-    if (is.null(env$bst) && is.null(env$bst_folds)) {
-        stop("Parent frame has neither 'bst' nor 'bst_folds'")
-    }
+  if (cb_name %in% .reserved_cb_names) {
+    stop("Cannot use reserved callback name '", cb_name, "'.")
   }
 
-  # convert from list to (sparse) matrix
-  list2mat <- function(coef_list) {
-    if (sparse) {
-      coef_mat <- sparseMatrix(x = unlist(lapply(coef_list, slot, "x")),
-                               i = unlist(lapply(coef_list, slot, "i")),
-                               p = c(0, cumsum(sapply(coef_list, function(x) length(x@x)))),
-                               dims = c(length(coef_list[[1]]), length(coef_list)))
-      return(t(coef_mat))
-    } else {
-      return(do.call(rbind, coef_list))
-    }
-  }
+  out <- list(
+    cb_name = cb_name,
+    env = env,
+    f_before_training = f_before_training,
+    f_before_iter = f_before_iter,
+    f_after_iter = f_after_iter,
+    f_after_training = f_after_training
+  )
+  class(out) <- "xgb.Callback"
+  return(out)
+}
 
-  finalizer <- function(env) {
-    if (length(coefs) == 0)
-      return()
-    if (!is.null(env$bst)) { # # xgb.train:
-      coefs <<- list2mat(coefs)
-    } else { # xgb.cv:
-      # second lapply transposes the list
-      coefs <<- lapply(
-        X = lapply(
-          X = seq_along(coefs[[1]]),
-          FUN = function(i) lapply(coefs, "[[", i)
-        ),
-        FUN = list2mat
+.execute.cb.before.training <- function(
+  callbacks,
+  model,
+  data,
+  evals,
+  begin_iteration,
+  end_iteration
+) {
+  for (callback in callbacks) {
+    if (!is.null(callback$f_before_training)) {
+      callback$f_before_training(
+        callback$env,
+        model,
+        data,
+        evals,
+        begin_iteration,
+        end_iteration
       )
     }
   }
-
-  extract.coef <- function(env) {
-    if (!is.null(env$bst)) { # # xgb.train:
-      cf <- as.numeric(grep('(booster|bias|weigh)', xgb.dump(env$bst), invert = TRUE, value = TRUE))
-      if (sparse) cf <- as(cf, "sparseVector")
-    } else { # xgb.cv:
-      cf <- vector("list", length(env$bst_folds))
-      for (i in seq_along(env$bst_folds)) {
-        dmp <- xgb.dump(env$bst_folds[[i]]$bst)
-        cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
-        if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
-      }
-    }
-    cf
-  }
-
-  callback <- function(env = parent.frame(), finalize = FALSE) {
-    if (is.null(coefs)) init(env)
-    if (finalize) return(finalizer(env))
-    cf <- extract.coef(env)
-    coefs <<- c(coefs, list(cf))
-  }
-
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.gblinear.history'
-  callback
 }
 
-#' @title Extract gblinear coefficients history.
-#' @description A helper function to extract the matrix of linear coefficients' history
-#' from a gblinear model created while using the \code{cb.gblinear.history()}
-#' callback.
-#' @details Note that this is an R-specific function that relies on R attributes that
-#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
-#' or \link{xgb.load.raw}.
-#'
-#' In order for a serialized model to be accepted by tgis function, one must use R
-#' serializers such as \link{saveRDS}.
-#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
-#'        using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
-#'        loaded from \link{xgb.load} or \link{xgb.load.raw}.
-#' @param class_index zero-based class index to extract the coefficients for only that
-#'        specific class in a multinomial multiclass model. When it is NULL, all the
-#'        coefficients are returned. Has no effect in non-multiclass models.
-#'
-#' @return
-#' For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns
-#' corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would
-#' return) and the rows corresponding to boosting iterations.
-#'
-#' For an \code{xgb.cv} result, a list of such matrices is returned with the elements
-#' corresponding to CV folds.
-#'
-#' @export
-xgb.gblinear.history <- function(model, class_index = NULL) {
-
-  if (!(inherits(model, "xgb.Booster") ||
-        inherits(model, "xgb.cv.synchronous")))
-    stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
-  is_cv <- inherits(model, "xgb.cv.synchronous")
-
-  if (is_cv) {
-    callbacks <- model$callbacks
-  } else {
-    callbacks <- attributes(model)$callbacks
+.execute.cb.before.iter <- function(
+  callbacks,
+  model,
+  data,
+  evals,
+  iteration
+) {
+  if (!length(callbacks)) {
+    return(FALSE)
   }
+  out <- sapply(callbacks, function(cb) {
+    if (is.null(cb$f_before_iter)) {
+      return(FALSE)
+    }
+    should_stop <- cb$f_before_iter(
+      cb$env,
+      model,
+      data,
+      evals,
+      iteration
+    )
+    if (!NROW(should_stop)) {
+      should_stop <- FALSE
+    } else if (NROW(should_stop) > 1) {
+      should_stop <- head(as.logical(should_stop), 1)
+    }
+    return(should_stop)
+  })
+  return(any(out))
+}
 
-  if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
-    stop("model must be trained while using the cb.gblinear.history() callback")
-
-  if (!is_cv) {
-    num_class <- xgb.num_class(model)
-    num_feat <- xgb.num_feature(model)
-  } else {
-    # in case of CV, the object is expected to have this info
-    if (model$params$booster != "gblinear")
-      stop("It does not appear to be a gblinear model")
-    num_class <- NVL(model$params$num_class, 1)
-    num_feat <- model$nfeatures
-    if (is.null(num_feat))
-      stop("This xgb.cv result does not have nfeatures info")
+.execute.cb.after.iter <- function(
+  callbacks,
+  model,
+  data,
+  evals,
+  iteration,
+  iter_feval
+) {
+  if (!length(callbacks)) {
+    return(FALSE)
   }
+  out <- sapply(callbacks, function(cb) {
+    if (is.null(cb$f_after_iter)) {
+      return(FALSE)
+    }
+    should_stop <- cb$f_after_iter(
+      cb$env,
+      model,
+      data,
+      evals,
+      iteration,
+      iter_feval
+    )
+    if (!NROW(should_stop)) {
+      should_stop <- FALSE
+    } else if (NROW(should_stop) > 1) {
+      should_stop <- head(as.logical(should_stop), 1)
+    }
+    return(should_stop)
+  })
+  return(any(out))
+}
 
-  if (!is.null(class_index) &&
-      num_class > 1 &&
-      (class_index[1] < 0 || class_index[1] >= num_class))
-    stop("class_index has to be within [0,", num_class - 1, "]")
-
-  coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
-  if (!is.null(class_index) && num_class > 1) {
-    coef_path <- if (is.list(coef_path)) {
-      lapply(coef_path,
-             function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)])
+.execute.cb.after.training <- function(
+  callbacks,
+  model,
+  data,
+  evals,
+  iteration,
+  final_feval,
+  prev_cb_res
+) {
+  if (!length(callbacks)) {
+    return(NULL)
+  }
+  old_cb_res <- attributes(model)
+  out <- lapply(callbacks, function(cb) {
+    if (is.null(cb$f_after_training)) {
+      return(NULL)
     } else {
-      coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)]
+      return(
+        cb$f_after_training(
+          cb$env,
+          model,
+          data,
+          evals,
+          iteration,
+          final_feval,
+          getElement(old_cb_res, cb$cb_name)
+        )
+      )
     }
+  })
+  names(out) <- sapply(callbacks, function(cb) cb$cb_name)
+  if (NROW(out)) {
+    out <- out[!sapply(out, is.null)]
   }
-  coef_path
+  return(out)
 }
 
+.summarize.feval <- function(iter_feval, showsd) {
+  # Reduce a multi-column metrics matrix to row means, plus row-wise standard deviations
+  # when 'showsd' is set; input with a single column is returned as-is, 'stdev' = NULL.
+  multi_col <- NCOL(iter_feval) > 1L
+  stdev <- if (multi_col && showsd) apply(iter_feval, 1, sd) else NULL
+  if (multi_col) {
+    iter_feval <- rowMeans(iter_feval)
+  }
+  # list() keeps a NULL 'stdev' entry, so both names are always present
+  return(list(feval = iter_feval, stdev = stdev))
+}
 
-#
-# Internal utility functions for callbacks ------------------------------------
-#
+.print.evaluation <- function(iter_feval, showsd, iteration) {
+  # Summarize the metrics, format them together with the iteration number, and print.
+  agg <- .summarize.feval(iter_feval, showsd)
+  cat(.format_eval_string(iteration, agg$feval, agg$stdev), '\n')
+}
 
 # Format the evaluation metric string
 .format_eval_string <- function(iter, eval_res, eval_err = NULL) {
@@ -784,69 +407,838 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
   return(paste0(iter, res))
 }
 
-# Extract callback names from the list of callbacks
-callback.names <- function(cb_list) {
-  unlist(lapply(cb_list, function(x) attr(x, 'name')))
-}
-
-# Extract callback calls from the list of callbacks
-callback.calls <- function(cb_list) {
-  unlist(lapply(cb_list, function(x) attr(x, 'call')))
-}
-
-# Add a callback cb to the list and make sure that
-# cb.early.stop and cb.cv.predict are at the end of the list
-# with cb.cv.predict being the last (when present)
-add.cb <- function(cb_list, cb) {
-  cb_list <- c(cb_list, cb)
-  names(cb_list) <- callback.names(cb_list)
-  if ('cb.early.stop' %in% names(cb_list)) {
-    cb_list <- c(cb_list, cb_list['cb.early.stop'])
-    # this removes only the first one
-    cb_list['cb.early.stop'] <- NULL
+#' @title Callback for printing the result of evaluation
+#' @param period results will be printed once every \code{period} iterations
+#' @param showsd whether standard deviations should be printed (when available)
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @description
+#' The callback function prints the result of evaluation at every \code{period} iterations.
+#' The initial and the last iteration's evaluations are always printed.
+#'
+#' Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that).
+#' @seealso \link{xgb.Callback}
+#' @export
+xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) {
+  if (length(period) != 1 || period != floor(period) || period < 1) {
+    stop("'period' must be a positive integer.")
   }
-  if ('cb.cv.predict' %in% names(cb_list)) {
-    cb_list <- c(cb_list, cb_list['cb.cv.predict'])
-    cb_list['cb.cv.predict'] <- NULL
-  }
-  cb_list
-}
 
-# Sort callbacks list into categories
-categorize.callbacks <- function(cb_list) {
-  list(
-    pre_iter = Filter(function(x) {
-        pre <- attr(x, 'is_pre_iteration')
-        !is.null(pre) && pre
-      }, cb_list),
-    post_iter = Filter(function(x) {
-        pre <- attr(x, 'is_pre_iteration')
-        is.null(pre) || !pre
-      }, cb_list),
-    finalize = Filter(function(x) {
-        'finalize' %in% names(formals(x))
-      }, cb_list)
+  xgb.Callback(
+    cb_name = "print_evaluation",
+    env = as.environment(list(period = period, showsd = showsd, is_first_call = TRUE)),
+    f_before_training = NULL,
+    f_before_iter = NULL,
+    f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+      if (is.null(iter_feval)) {
+        return(FALSE)
+      }
+      if (env$is_first_call || (iteration - 1) %% env$period == 0) {
+        .print.evaluation(iter_feval, env$showsd, iteration)
+        env$last_printed_iter <- iteration
+      }
+      env$is_first_call <- FALSE
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      if (is.null(final_feval)) {
+        return(NULL)
+      }
+      if (is.null(env$last_printed_iter) || iteration > env$last_printed_iter) {
+        .print.evaluation(final_feval, env$showsd, iteration)
+      }
+    }
   )
 }
 
-# Check whether all callback functions with names given by 'query_names' are present in the 'cb_list'.
-has.callbacks <- function(cb_list, query_names) {
-  if (length(cb_list) < length(query_names))
-    return(FALSE)
-  if (!is.list(cb_list) ||
-      any(sapply(cb_list, class) != 'function')) {
-    stop('`cb_list` must be a list of callback functions')
-  }
-  cb_names <- callback.names(cb_list)
-  if (!is.character(cb_names) ||
-      length(cb_names) != length(cb_list) ||
-      any(cb_names == "")) {
-    stop('All callbacks in the `cb_list` must have a non-empty `name` attribute')
-  }
-  if (!is.character(query_names) ||
-      length(query_names) == 0 ||
-      any(query_names == "")) {
-    stop('query_names must be a non-empty vector of non-empty character names')
-  }
-  return(all(query_names %in% cb_names))
+#' @title Callback for logging the evaluation history
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details This callback creates a table with per-iteration evaluation metrics (see parameters
+#' `evals` and `feval` in \link{xgb.train}).
+#' @details
+#' Note: in the column names of the final data.table, the dash '-' character is replaced with
+#' the underscore '_' in order to make the column names more like regular R identifiers.
+#' @seealso \link{xgb.cb.print.evaluation}
+#' @export
+xgb.cb.evaluation.log <- function() {
+  xgb.Callback(
+    cb_name = "evaluation_log",
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      # Pre-allocate one slot per planned iteration; 'next_log' is the next slot to fill.
+      env$evaluation_log <- vector("list", end_iteration - begin_iteration + 1)
+      env$next_log <- 1
+    },
+    f_before_iter = NULL,
+    f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+      tmp <- .summarize.feval(iter_feval, TRUE)
+      env$evaluation_log[[env$next_log]] <- list(iter = iteration, metrics = tmp$feval, sds = tmp$stdev)
+      env$next_log <- env$next_log + 1
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      if (!NROW(env$evaluation_log)) {
+        return(prev_cb_res)
+      }
+      # in case of early stopping: drop the pre-allocated slots that were never filled
+      if (env$next_log <= length(env$evaluation_log)) {
+        env$evaluation_log <- head(env$evaluation_log, env$next_log - 1)
+      }
+
+      iters <- data.frame(iter = sapply(env$evaluation_log, function(x) x$iter))
+      metrics <- do.call(rbind, lapply(env$evaluation_log, function(x) x$metrics))
+      mnames <- gsub("-", "_", names(env$evaluation_log[[1]]$metrics), fixed = TRUE)
+      colnames(metrics) <- mnames
+      has_sds <- !is.null(env$evaluation_log[[1]]$sds)
+      if (has_sds) {
+        sds <- do.call(rbind, lapply(env$evaluation_log, function(x) x$sds))
+        colnames(sds) <- mnames
+        # Interleave the columns as <metric>_mean, <metric>_std for every metric.
+        metrics <- lapply(mnames, function(metric) {
+          out <- cbind(metrics[, metric], sds[, metric])
+          colnames(out) <- paste0(metric, c("_mean", "_std"))
+          return(out)
+        })
+        metrics <- do.call(cbind, metrics)
+      }
+      evaluation_log <- cbind(iters, metrics)
+
+      if (!is.null(prev_cb_res)) {
+        if (!is.data.table(prev_cb_res)) {
+          prev_cb_res <- data.table::as.data.table(prev_cb_res)
+        }
+        # Training continuation: prepend only the earlier rows that precede this run's
+        # first iteration, so overlapping iterations are not logged twice.
+        prev_take <- prev_cb_res[prev_cb_res$iter < min(evaluation_log$iter)]
+        if (nrow(prev_take)) {
+          evaluation_log <- rbind(prev_take, evaluation_log)
+        }
+      }
+      return(data.table::as.data.table(evaluation_log))
+    }
+  )
+}
+
+#' @title Callback for resetting the booster's parameters at each iteration.
+#' @param new_params a list where each element corresponds to a parameter that needs to be reset.
+#'        Each element's value must be either a vector of values of length \code{nrounds}
+#'        to be set at each iteration,
+#'        or a function of two parameters \code{learning_rates(iteration, nrounds)}
+#'        which returns a new parameter value by using the current iteration number
+#'        and the total number of boosting rounds.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details
+#' Note that when training is resumed from some previous model, and a function is used to
+#' reset a parameter value, the \code{nrounds} argument in this function would be
+#' the number of boosting rounds in the current training.
+#'
+#' Does not leave any attribute in the booster.
+#' @export
+xgb.cb.reset.parameters <- function(new_params) {
+  stopifnot(is.list(new_params))
+  pnames <- gsub(".", "_", names(new_params), fixed = TRUE)  # dots are treated like underscores
+  not_allowed <- pnames %in%
+    c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
+  if (any(not_allowed))
+    stop('Parameters ', paste(pnames[not_allowed], collapse = ", "), " cannot be changed during boosting.")
+
+  xgb.Callback(
+    cb_name = "reset_parameters",
+    env = as.environment(list(new_params = new_params)),
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      env$end_iteration <- end_iteration
+      # Validate every entry. Entries are looked up by their ORIGINAL names: a
+      # dot-to-underscore copy of a name such as 'learning.rate' would find nothing.
+      for (n in names(env$new_params)) {
+        p <- env$new_params[[n]]
+        if (is.function(p)) {
+          if (length(formals(p)) != 2)
+            stop("Parameter '", n, "' is a function but not of two arguments")
+        } else if (is.numeric(p) || is.character(p)) {
+          if (length(p) != env$end_iteration)
+            stop("Length of '", n, "' has to be equal to 'nrounds'")
+        } else {
+          stop("Parameter '", n, "' is not a function or a vector")
+        }
+      }
+    },
+    f_before_iter = function(env, model, data, evals, iteration) {
+      pars <- lapply(env$new_params, function(p) {  # value of each parameter for this round
+        if (is.function(p)) {
+          return(p(iteration, env$end_iteration))
+        } else {
+          return(p[iteration])
+        }
+      })
+
+      if (inherits(model, "xgb.Booster")) {
+        xgb.parameters(model) <- pars
+      } else {
+        for (fd in model) {  # otherwise 'model' is iterated as a collection of folds
+          xgb.parameters(fd$bst) <- pars
+        }
+      }
+      return(FALSE)
+    },
+    f_after_iter = NULL,
+    f_after_training = NULL
+  )
+}
+
+#' @title Callback to activate early stopping
+#' @param stopping_rounds The number of rounds with no improvement in
+#'        the evaluation metric in order to stop the training.
+#' @param maximize Whether to maximize the evaluation metric.
+#' @param metric_name The name of an evaluation column to use as a criterion for early
+#'        stopping. If not set, the last column would be used.
+#'        Let's say the test data in \code{evals} was labelled as \code{dtest},
+#'        and one wants to use the AUC in test data for early stopping regardless of where
+#'        it is in the \code{evals}, then one of the following would need to be set:
+#'        \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
+#'        All dash '-' characters in metric names are considered equivalent to '_'.
+#' @param verbose Whether to print the early stopping information.
+#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced
+#'        in the resulting object. If passing `FALSE`, will only keep the boosting rounds
+#'        up to the detected best iteration, discarding the ones that come after.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @description
+#' This callback function determines the condition for early stopping.
+#'
+#' The following attributes are assigned to the booster's object:
+#' \itemize{
+#' \item \code{best_score} the evaluation score at the best iteration
+#' \item \code{best_iteration} at which boosting iteration the best score has occurred
+#' (0-based index for interoperability of binary models)
+#' }
+#'
+#' The same values are also stored as R attributes as a result of the callback, plus an additional
+#' attribute `stopped_by_max_rounds` which indicates whether an early stopping by the `stopping_rounds`
+#' condition occurred. Note that the `best_iteration` that is stored under R attributes will follow
+#' base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
+#' through \link{xgb.attr} or \link{xgb.attributes}.
+#'
+#' At least one dataset is required in `evals` for early stopping to work.
+#' @export
+xgb.cb.early.stop <- function(
+  stopping_rounds,
+  maximize = FALSE,
+  metric_name = NULL,
+  verbose = TRUE,
+  keep_all_iter = TRUE
+) {
+  if (!is.null(metric_name)) {
+    stopifnot(is.character(metric_name))
+    stopifnot(length(metric_name) == 1L)
+  }
+
+  xgb.Callback(
+    cb_name = "early_stop",
+    env = as.environment(
+      list(
+        checked_evnames = FALSE,  # flips to TRUE once the metric has been resolved
+        stopping_rounds = stopping_rounds,
+        maximize = maximize,
+        metric_name = metric_name,
+        verbose = verbose,
+        keep_all_iter = keep_all_iter,
+        stopped_by_max_rounds = FALSE
+      )
+    ),
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      if (inherits(model, "xgb.Booster") && !length(evals)) {
+        stop("For early stopping, 'evals' must have at least one element")
+      }
+      env$begin_iteration <- begin_iteration
+      return(NULL)
+    },
+    f_before_iter = function(env, model, data, evals, iteration) NULL,
+    f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+      sds <- NULL
+      if (NCOL(iter_feval) > 1) {  # multi-column input is summarized into means and std devs
+        tmp <- .summarize.feval(iter_feval, TRUE)
+        iter_feval <- tmp$feval
+        sds <- tmp$stdev
+      }
+
+      if (!env$checked_evnames) {
+
+        eval_names <- gsub('-', '_', names(iter_feval), fixed = TRUE)
+        if (!is.null(env$metric_name)) {
+          env$metric_idx <- which(gsub('-', '_', env$metric_name, fixed = TRUE) == eval_names)
+          if (length(env$metric_idx) == 0)
+            stop("'metric_name' for early stopping is not one of the following:\n",
+                 paste(eval_names, collapse = ' '), '\n')
+        }
+
+        if (is.null(env$metric_name)) {
+          if (NROW(iter_feval) == 1) {
+            env$metric_idx <- 1L
+          } else {
+            env$metric_idx <- length(eval_names)
+            if (env$verbose)
+              cat('Multiple eval metrics are present. Will use ',
+                  eval_names[env$metric_idx], ' for early stopping.\n', sep = '')
+          }
+        }
+
+        env$metric_name <- eval_names[env$metric_idx]
+
+        # maximize is usually NULL when not set in xgb.train and built-in metrics
+        if (is.null(env$maximize))
+          env$maximize <- grepl('(_auc|_aupr|_map|_ndcg|_pre)', env$metric_name)
+
+        if (env$verbose)
+          cat("Will train until ", env$metric_name, " hasn't improved in ",
+              env$stopping_rounds, " rounds.\n\n", sep = '')
+
+        env$best_iteration <- env$begin_iteration
+        if (env$maximize) {
+          env$best_score <- -Inf
+        } else {
+          env$best_score <- Inf
+        }
+
+        if (inherits(model, "xgb.Booster")) {  # pick up a best score stored on the booster, if any
+          best_score <- xgb.attr(model, 'best_score')
+          if (NROW(best_score)) env$best_score <- as.numeric(best_score)
+          best_iteration <- xgb.attr(model, 'best_iteration')
+          if (NROW(best_iteration)) env$best_iteration <- as.numeric(best_iteration) + 1
+        }
+
+        env$checked_evnames <- TRUE
+      }
+
+      score <- iter_feval[env$metric_idx]
+      if ((env$maximize && score > env$best_score) ||
+          (!env$maximize && score < env$best_score)) {
+        if (env$verbose) env$best_msg <- .format_eval_string(iteration, iter_feval, sds)
+        env$best_score <- score
+        env$best_iteration <- iteration
+        # save the property to attributes, so they will occur in checkpoint
+        if (inherits(model, "xgb.Booster")) {
+          xgb.attributes(model) <- list(
+            best_iteration = env$best_iteration - 1, # convert to 0-based index
+            best_score = env$best_score
+          )
+        }
+      } else if (iteration - env$best_iteration >= env$stopping_rounds) {
+        if (env$verbose) {  # report the BEST round; fall back to a short line if none was recorded
+          best_msg <- NVL(env$best_msg, paste0("[", env$best_iteration, "]\t", env$metric_name, ":", env$best_score))
+          cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '')
+        }
+        env$stopped_by_max_rounds <- TRUE
+        return(TRUE)
+      }
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      if (inherits(model, "xgb.Booster") && !env$keep_all_iter && env$best_iteration < iteration) {
+        # Note: it loses the attributes after being sliced,
+        # so they have to be re-assigned afterwards.
+        prev_attr <- xgb.attributes(model)
+        if (NROW(prev_attr)) {
+          suppressWarnings({
+            prev_attr <- within(prev_attr, rm("best_score", "best_iteration"))
+          })
+        }
+        .Call(XGBoosterSliceAndReplace_R, xgb.get.handle(model), 0L, env$best_iteration, 1L)
+        if (NROW(prev_attr)) {
+          xgb.attributes(model) <- prev_attr
+        }
+      }
+      attrs_set <- list(best_iteration = env$best_iteration - 1, best_score = env$best_score)
+      if (inherits(model, "xgb.Booster")) {
+        xgb.attributes(model) <- attrs_set
+      } else {
+        for (fd in model) {
+          xgb.attributes(fd$bst) <- attrs_set # to use in the cv.predict callback
+        }
+      }
+      return(
+        list(
+          best_iteration = env$best_iteration,  # base-1 here; the booster attribute is base-0
+          best_score = env$best_score,
+          stopped_by_max_rounds = env$stopped_by_max_rounds
+        )
+      )
+    }
+  )
+}
+
+.save.model.w.formatted.name <- function(model, save_name, iteration) {
+  # 'sprintf' warns when 'save_name' has no specifier to consume 'iteration';
+  # a plain file name is a valid input here, so that warning is muted.
+  out_path <- suppressWarnings(sprintf(save_name, iteration))
+  # Write the model under the resolved name; the result of 'xgb.save' is returned.
+  xgb.save(model, out_path)
+}
+
+#' @title Callback for saving a model file.
+#' @param save_period Save the model to disk after every
+#'        \code{save_period} iterations; 0 means save the model at the end.
+#' @param save_name The name or path for the saved model file.
+#'        It can contain a \code{\link[base]{sprintf}} formatting specifier
+#'        to include the integer iteration number in the file name.
+#'        E.g., with \code{save_name} = 'xgboost_%04d.model',
+#'        the file saved at iteration 50 would be named "xgboost_0050.model".
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train},
+#'         but \bold{not} to \link{xgb.cv}.
+#' @description
+#' This callback function saves an xgb-model file, either periodically
+#' every \code{save_period} iterations or at the end.
+#'
+#' Does not leave any attribute in the booster.
+#' @export
+xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
+  if (save_period < 0) stop("'save_period' cannot be negative")
+  if (!(is.character(save_name) && length(save_name) == 1L)) {
+    stop("'save_name' must be a single character refering to file name.")
+  }
+  # State shared by the hooks; 'last_save' is the last iteration written periodically.
+  cb_state <- list(save_period = save_period, save_name = save_name, last_save = 0)
+  xgb.Callback(
+    cb_name = "save_model",
+    env = as.environment(cb_state),
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      env$begin_iteration <- begin_iteration
+    },
+    f_before_iter = NULL,
+    f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+      rounds_done <- iteration - env$begin_iteration
+      if (env$save_period > 0 && rounds_done %% env$save_period == 0) {
+        .save.model.w.formatted.name(model, env$save_name, iteration)
+        env$last_save <- iteration
+      }
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      if (env$save_period == 0 && iteration > env$last_save) {
+        .save.model.w.formatted.name(model, env$save_name, iteration)
+      }
+    }
+  )
+}
+
+#' @title Callback for returning cross-validation based predictions.
+#' @param save_models A flag for whether to save the folds' models.
+#' @param outputmargin Whether to save margin predictions (same effect as passing this
+#' parameter to \link{predict.xgb.Booster}).
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.cv},
+#'         but \bold{not} to \link{xgb.train}.
+#' @description
+#' This callback function saves predictions for all of the test folds,
+#' and also allows to save the folds' models.
+#' @details
+#' Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix,
+#' depending on the number of prediction outputs per data row. The order of predictions corresponds
+#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is
+#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
+#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
+#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
+#' When some of the indices in the training dataset are not included into user-provided \code{folds},
+#' their prediction value would be \code{NA}.
+#' @export
+xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
+  xgb.Callback(
+    cb_name = "cv_predict",
+    env = as.environment(list(save_models = save_models, outputmargin = outputmargin)),
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      if (inherits(model, "xgb.Booster")) {
+        stop("'cv.predict' callback is only for 'xgb.cv'.")
+      }
+    },
+    f_before_iter = NULL,
+    f_after_iter = NULL,
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      pred <- NULL  # allocated on the first fold, once the prediction shape is known
+      for (fd in model) {
+        pr <- predict(
+          fd$bst,
+          fd$evals[[2L]],
+          outputmargin = env$outputmargin,
+          reshape = TRUE
+        )
+        if (is.null(pred)) {
+          if (NCOL(pr) > 1L) {
+            pred <- matrix(NA_real_, nrow(data), ncol(pr))
+          } else {
+            pred <- rep(NA_real_, nrow(data))  # plain vector, as documented for one output per row
+          }
+        }
+        if (is.matrix(pred)) {
+          pred[fd$index, ] <- pr
+        } else {
+          pred[fd$index] <- pr
+        }
+      }
+      out <- list(pred = pred)  # rows not covered by any fold's 'index' remain NA
+      if (env$save_models) {
+        out$models <- lapply(model, function(fd) fd$bst)
+      }
+      return(out)
+    }
+  )
+}
+
+.list2mat <- function(coef_list, sparse) {
+  # Dense input: stack the per-iteration vectors as rows, without dimnames.
+  if (!sparse) {
+    return(unname(do.call(rbind, coef_list)))
+  }
+  # Sparse input: fill the slots of a "dgRMatrix" from the sparse vectors.
+  csr <- methods::new("dgRMatrix")
+  csr@p <- as.integer(c(0, cumsum(sapply(coef_list, function(v) length(v@x)))))
+  csr@j <- as.integer(unlist(lapply(coef_list, slot, "i")) - 1L)
+  csr@x <- unlist(lapply(coef_list, slot, "x"))
+  csr@Dim <- as.integer(c(length(coef_list), length(coef_list[[1L]])))
+  # Note: function 'xgb.gblinear.history' might later on try to slice by columns
+  return(methods::as(csr, "CsparseMatrix"))
+}
+
+.extract.coef <- function(model, sparse) {
+  # Unnamed coefficients; a multi-column result is flattened into a vector.
+  out <- .internal.coef.xgb.Booster(model, add_names = FALSE)
+  if (NCOL(out) > 1L) out <- as.vector(out)
+  # Convert to a sparse vector only when the caller asked for sparse storage.
+  if (!sparse) {
+    return(out)
+  }
+  return(methods::as(out, "sparseVector"))
+}
+
+#' @title Callback for collecting coefficients history of a gblinear booster
+#' @param sparse when set to `FALSE`/`TRUE`, a dense/sparse matrix is used to store the result.
+#'       Sparse format is useful when one expects only a subset of coefficients to be non-zero,
+#'       when using the "thrifty" feature selector with fairly small number of top features
+#'       selected per iteration.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details
+#' To keep things fast and simple, gblinear booster does not internally store the history of linear
+#' model coefficients at each boosting iteration. This callback provides a workaround for storing
+#' the coefficients' path, by extracting them after each training iteration.
+#'
+#' This callback will construct a matrix where rows are boosting iterations and columns are
+#' feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept
+#' corresponding to the first column).
+#'
+#' When there is more than one coefficient per feature (e.g. multi-class classification),
+#' the result will be reshaped into a vector where coefficients are arranged first by features and
+#' then by class (e.g. first 1 through N coefficients will be for the first class, then
+#' coefficients N+1 through 2N for the second class, and so on).
+#'
+#' If the result has only one coefficient per feature in the data, then the resulting matrix
+#' will have column names matching with the feature names, otherwise (when there's more than
+#' one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
+#' (so e.g. column 'c1' for class '0' will be named 'c1:0').
+#'
+#' With \code{xgb.train}, the output is either a dense or a sparse matrix.
+#' With \code{xgb.cv}, it is a list (one element per each fold) of such
+#' matrices.
+#'
+#' Function \link{xgb.gblinear.history} provides an easy way to retrieve the
+#' outputs from this callback.
+#' @seealso \link{xgb.gblinear.history}, \link{coef.xgb.Booster}.
+#' @examples
+#' #### Binary classification:
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
+#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
+#' # without considering the 2nd order interactions:
+#' x <- model.matrix(Species ~ .^2, iris)[,-1]
+#' colnames(x)
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
+#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
+#' # For 'shotgun', which is a default linear updater, using high eta values may result in
+#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
+#' # rate does not break the convergence, but allows us to illustrate the typical pattern of
+#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
+#'                  callbacks = list(xgb.cb.gblinear.history()))
+#' # Extract the coefficients' path and plot them vs boosting iteration number:
+#' coef_path <- xgb.gblinear.history(bst)
+#' matplot(coef_path, type = 'l')
+#'
+#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates.
+#' # Will try the classical componentwise boosting which selects a single best feature per round:
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
+#'                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
+#'                  callbacks = list(xgb.cb.gblinear.history()))
+#' matplot(xgb.gblinear.history(bst), type = 'l')
+#' #  Componentwise boosting is known to have similar effect to Lasso regularization.
+#' # Try experimenting with various values of top_k, eta, nrounds,
+#' # as well as different feature_selectors.
+#'
+#' # For xgb.cv:
+#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
+#'               callbacks = list(xgb.cb.gblinear.history()))
+#' # coefficients in the CV fold #3
+#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
+#'
+#'
+#' #### Multiclass classification:
+#' #
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)
+#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
+#' # For the default linear updater 'shotgun' it sometimes is helpful
+#' # to use smaller eta to reduce instability
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
+#'                  callbacks = list(xgb.cb.gblinear.history()))
+#' # Will plot the coefficient paths separately for each class:
+#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
+#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l')
+#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l')
+#'
+#' # CV:
+#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
+#'               callbacks = list(xgb.cb.gblinear.history(FALSE)))
+#' # 1st fold of 1st class
+#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
+#'
+#' @export
+xgb.cb.gblinear.history <- function(sparse = FALSE) {
+  xgb.Callback(
+    cb_name = "gblinear_history",
+    env = as.environment(list(sparse = sparse)),
+    f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) {
+      if (!inherits(model, "xgb.Booster")) {
+        model <- model[[1L]]$bst  # booster type is checked on the first fold only
+      }
+      if (xgb.booster_type(model) != "gblinear") {
+        stop("Callback 'xgb.cb.gblinear.history' is only for booster='gblinear'.")
+      }
+      env$coef_hist <- vector("list", end_iteration - begin_iteration + 1)
+      env$next_idx <- 1
+    },
+    f_before_iter = NULL,
+    f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+      if (inherits(model, "xgb.Booster")) {
+        coef_this <- .extract.coef(model, env$sparse)
+      } else {
+        coef_this <- lapply(model, function(fd) .extract.coef(fd$bst, env$sparse))
+      }
+      env$coef_hist[[env$next_idx]] <- coef_this
+      env$next_idx <- env$next_idx + 1
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) {
+      # in case of early stopping
+      if (env$next_idx <= length(env$coef_hist)) {
+        env$coef_hist <- head(env$coef_hist, env$next_idx - 1)
+      }
+
+      is_booster <- inherits(model, "xgb.Booster")
+      if (is_booster) {
+        out <- .list2mat(env$coef_hist, env$sparse)
+      } else {
+        out <- lapply(  # regroup iteration-major history by fold, then build one matrix per fold
+          X = lapply(
+            X = seq_along(env$coef_hist[[1]]),
+            FUN = function(i) lapply(env$coef_hist, "[[", i)
+          ),
+          FUN = .list2mat,
+          env$sparse
+        )
+      }
+      if (!is.null(prev_cb_res)) {
+        if (is_booster) {
+          out <- rbind(prev_cb_res, out)
+        } else {
+          # Note: this case should never be encountered, since training cannot
+          # be continued from the result of xgb.cv, but this code should in
+          # theory do the job if the situation were to be encountered.
+          # 'out' holds one matrix per fold and 'prev_cb_res' is presumably
+          # shaped the same way, so earlier rows are stacked fold by fold
+          # (iterating inside one matrix would walk its scalar entries).
+          out <- lapply(
+            seq_along(out),
+            function(i) {
+              rbind(prev_cb_res[[i]], out[[i]])
+            }
+          )
+        }
+      }
+      feature_names <- getinfo(data, "feature_name")
+      if (!NROW(feature_names)) {
+        feature_names <- paste0("V", seq(1L, ncol(data)))
+      }
+      expected_ncols <- length(feature_names) + 1  # one extra column for the intercept
+      if (is_booster) {
+        mat_ncols <- ncol(out)
+      } else {
+        mat_ncols <- ncol(out[[1L]])
+      }
+      if (mat_ncols %% expected_ncols == 0) {  # columns are named only when the counts line up
+        feature_names <- c("(Intercept)", feature_names)
+        n_rep <- mat_ncols / expected_ncols
+        if (n_rep > 1) {
+          feature_names <- unlist(
+            lapply(
+              seq(1, n_rep),
+              function(cl) paste(feature_names, cl - 1, sep = ":")
+            )
+          )
+        }
+        if (is_booster) {
+          colnames(out) <- feature_names
+        } else {
+          out <- lapply(
+            out,
+            function(mat) {
+              colnames(mat) <- feature_names
+              return(mat)
+            }
+          )
+        }
+      }
+      return(out)
+    }
+  )
+}
+
+#' @title Extract gblinear coefficients history.
+#' @description A helper function to extract the matrix of linear coefficients' history
+#' from a gblinear model created while using the \link{xgb.cb.gblinear.history}
+#' callback (which must be added manually as by default it's not used).
+#' @details Note that this is an R-specific function that relies on R attributes that
+#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
+#' or \link{xgb.load.raw}.
+#'
+#' In order for a serialized model to be accepted by this function, one must use R
+#' serializers such as \link{saveRDS}.
+#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
+#'        using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster
+#'        loaded from \link{xgb.load} or \link{xgb.load.raw}.
+#' @param class_index zero-based class index to extract the coefficients for only that
+#'        specific class in a multinomial multiclass model. When it is NULL, all the
+#'        coefficients are returned. Has no effect in non-multiclass models.
+#'
+#' @return
+#' For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns
+#' corresponding to iteration's coefficients and the rows corresponding to boosting iterations.
+#'
+#' For an \link{xgb.cv} result, a list of such matrices is returned with the elements
+#' corresponding to CV folds.
+#'
+#' When there is more than one coefficient per feature (e.g. multi-class classification)
+#' and `class_index` is not provided,
+#' the result will be reshaped into a vector where coefficients are arranged first by features and
+#' then by class (e.g. first 1 through N coefficients will be for the first class, then
+#' coefficients N+1 through 2N for the second class, and so on).
+#' @seealso \link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}.
+#' @export
+xgb.gblinear.history <- function(model, class_index = NULL) {
+  # Accepts a booster (history kept as an R attribute) or an xgb.cv result (history kept as an element).
+  if (!(inherits(model, "xgb.Booster") ||
+        inherits(model, "xgb.cv.synchronous")))
+    stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
+  is_cv <- inherits(model, "xgb.cv.synchronous")
+
+  if (!is_cv) {
+    coef_path <- getElement(attributes(model), "gblinear_history")
+  } else {
+    coef_path <- getElement(model, "gblinear_history")
+  }
+  if (is.null(coef_path)) {
+    stop("model must be trained while using the xgb.cb.gblinear.history() callback")
+  }
+
+  if (!is_cv) {
+    num_class <- xgb.num_class(model)
+    num_feat <- xgb.num_feature(model)
+  } else {
+    # in case of CV, the object is expected to have this info
+    if (!isTRUE(model$params$booster == "gblinear"))  # also rejects a missing 'booster' entry
+      stop("It does not appear to be a gblinear model")
+    num_class <- NVL(model$params$num_class, 1)
+    num_feat <- model$nfeatures
+    if (is.null(num_feat))
+      stop("This xgb.cv result does not have nfeatures info")
+  }
+
+  if (!is.null(class_index) &&
+      num_class > 1 &&
+      (class_index[1] < 0 || class_index[1] >= num_class))
+    stop("class_index has to be within [0,", num_class - 1, "]")
+
+  if (!is.null(class_index) && num_class > 1) {
+    seq_take <- seq(1 + class_index * (num_feat + 1), (class_index + 1) * (num_feat + 1))
+    coef_path <- if (is.list(coef_path)) {
+      lapply(coef_path, function(x) x[, seq_take, drop = FALSE])
+    } else {
+      coef_path[, seq_take, drop = FALSE]  # 'drop = FALSE' keeps a one-row history as a matrix
+    }
+  }
+  return(coef_path)
+}
+
+.callbacks.only.train <- "save_model"  # callback names that '.process.callbacks' rejects when is_cv is TRUE
+.callbacks.only.cv <- "cv_predict"  # callback names that '.process.callbacks' rejects when is_cv is FALSE
+
+.process.callbacks <- function(callbacks, is_cv) {
+  if (inherits(callbacks, "xgb.Callback")) {  # a lone callback is wrapped into a list
+    callbacks <- list(callbacks)
+  }
+  if (!is.list(callbacks)) {
+    stop("'callbacks' must be a list.")
+  }
+  cb_names <- character()
+  if (length(callbacks)) {
+    is_callback <- sapply(callbacks, inherits, "xgb.Callback")
+    if (!all(is_callback)) {
+      stop("Entries in 'callbacks' must be 'xgb.Callback' objects.")
+    }
+    cb_names <- sapply(callbacks, function(cb) cb$cb_name)
+    if (length(cb_names) != length(callbacks)) {
+      stop("Passed invalid callback(s).")
+    }
+    if (anyDuplicated(cb_names) > 0) {
+      stop("Callbacks must have unique names.")
+    }
+    if (is_cv) {
+      if (any(.callbacks.only.train %in% cb_names)) {
+        stop(
+          "Passed callback(s) not supported for 'xgb.cv': ",
+          paste(intersect(.callbacks.only.train, cb_names), collapse = ", ")
+        )
+      }
+    } else {
+      if (any(.callbacks.only.cv %in% cb_names)) {
+        stop(
+          "Passed callback(s) not supported for 'xgb.train': ",
+          paste(intersect(.callbacks.only.cv, cb_names), collapse = ", ")
+        )
+      }
+    }
+    if ("early_stop" %in% cb_names) {  # early stopping needs to be executed before the others
+      mask <- cb_names == "early_stop"
+      callbacks <- c(list(callbacks[[which(mask)]]), callbacks[!mask])
+      cb_names <- c("early_stop", cb_names[!mask])  # keep the names aligned with the new order
+    }
+  }
+  return(list(callbacks = callbacks, cb_names = cb_names))
+}
+
+# Note: don't try to use functions like 'append', as they will
+# merge the elements of the different callbacks into a single list.
+add.callback <- function(callbacks, cb, as_first_elt = FALSE) {
+  n_prev <- length(callbacks)
+  # Appending: assign 'cb' into the slot one past the current end.
+  if (!as_first_elt) {
+    callbacks[[n_prev + 1]] <- cb
+    return(callbacks)
+  }
+  # Prepending: 'cb' takes slot 1 of a fresh list and the old entries follow.
+  if (n_prev == 0) return(list(cb))
+  shifted <- vector("list", n_prev + 1)
+  shifted[[1]] <- cb
+  shifted[seq(2, n_prev + 1)] <- callbacks
+  return(shifted)
+}
+
+has.callbacks <- function(callbacks, cb_name) {  # TRUE when a callback named 'cb_name' is in 'callbacks'
+  cb_names <- sapply(callbacks, function(cb) cb$cb_name)  # the field is 'cb_name'; 'cb$name' is always NULL
+  return(cb_name %in% cb_names)
 }
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index e8ae787fc..08afab889 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -142,7 +142,7 @@ check.custom.eval <- function(env = parent.frame()) {
   if (!is.null(env$feval) &&
       is.null(env$maximize) && (
         !is.null(env$early_stopping_rounds) ||
-        has.callbacks(env$callbacks, 'cb.early.stop')))
+        has.callbacks(env$callbacks, "early_stop")))
     stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not")
 }
 
@@ -193,20 +193,20 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) {
 # Evaluate one iteration.
 # Returns a named vector of evaluation metrics
 # with the names in a 'datasetname-metricname' format.
-xgb.iter.eval <- function(bst, watchlist, iter, feval) {
+xgb.iter.eval <- function(bst, evals, iter, feval) {
   handle <- xgb.get.handle(bst)
 
-  if (length(watchlist) == 0)
+  if (length(evals) == 0)
     return(NULL)
 
-  evnames <- names(watchlist)
+  evnames <- names(evals)
   if (is.null(feval)) {
-    msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
+    msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), evals, as.list(evnames))
     mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
     res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
   } else {
-    res <- sapply(seq_along(watchlist), function(j) {
-      w <- watchlist[[j]]
+    res <- sapply(seq_along(evals), function(j) {
+      w <- evals[[j]]
       ## predict using all trees
       preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all")
       eval_res <- feval(preds, w)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index febefb757..77d75fa9c 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -77,26 +77,45 @@ xgb.get.handle <- function(object) {
 
 #' Predict method for XGBoost model
 #'
-#' Predicted values based on either xgboost model or model handle object.
+#' Predict values on data based on xgboost model.
 #'
 #' @param object Object of class `xgb.Booster`.
-#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
+#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
 #'        local data file, or `xgb.DMatrix`.
-#'        For single-row predictions on sparse data, it is recommended to use the CSR format.
-#'        If passing a sparse vector, it will take it as a row vector.
-#' @param missing Only used when input is a dense matrix. Pick a float value that represents
-#'        missing values in data (e.g., 0 or some other extreme value).
+#'
+#'        For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+#'        a sparse vector, it will take it as a row vector.
+#'
+#'        Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+#'        pass here instead of passing R types like matrices or data frames, as predictions will be
+#'        faster on DMatrix.
+#'
+#'        If `newdata` is a `data.frame`, be aware that:\itemize{
+#'        \item Columns will be converted to numeric if they aren't already, which could potentially make
+#'              the operation slower than in an equivalent `matrix` object.
+#'        \item The order of the columns must match with that of the data from which the model was fitted
+#'              (i.e. columns will not be referenced by their names, just by their order in the data).
+#'        \item If the model was fitted to data with categorical columns, these columns must be of
+#'              `factor` type here, and must use the same encoding (i.e. have the same levels).
+#'        \item If `newdata` contains any `factor` columns, they will be converted to base-0
+#'              encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+#'              under a column which during training had a different type.
+#'        }
+#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value).
+#'
+#'        This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, it should be
+#'        passed as an argument to the DMatrix constructor instead.
 #' @param outputmargin Whether the prediction should be returned in the form of original untransformed
 #'        sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for
 #'        logistic regression would return log-odds instead of probabilities.
-#' @param predleaf Whether to predict pre-tree leaf indices.
+#' @param predleaf Whether to predict per-tree leaf indices.
 #' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
 #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape Whether to reshape the vector of predictions to matrix form when there are several
 #'        prediction outputs per case. No effect if `predleaf`, `predcontrib`,
 #'        or `predinteraction` is `TRUE`.
-#' @param training Whether the predictions are used for training. For dart booster,
+#' @param training Whether the prediction result is used for training. For dart booster,
 #'        training predicting will perform dropout.
 #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
 #'        a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
@@ -111,6 +130,12 @@ xgb.get.handle <- function(object) {
 #'        If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
 #' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
 #'        type and shape of predictions are invariant to the model type.
+#' @param base_margin Base margin used for boosting from existing model.
+#'
+#'        Note that, if `newdata` is an `xgb.DMatrix` object, passing this argument will
+#'        throw an error, as it needs to be added to the DMatrix instead (e.g. by passing it as
+#'        an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+#'
 #' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
 #'        match (only applicable when both `object` and `newdata` have feature names).
 #'
@@ -287,16 +312,80 @@ xgb.get.handle <- function(object) {
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
                                 reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
-                                validate_features = FALSE, ...) {
+                                validate_features = FALSE, base_margin = NULL, ...) {
   if (validate_features) {
     newdata <- validate.features(object, newdata)
   }
-  if (!inherits(newdata, "xgb.DMatrix")) {
+  is_dmatrix <- inherits(newdata, "xgb.DMatrix")
+  if (is_dmatrix && !is.null(base_margin)) {
+    stop(
+      "'base_margin' is not supported when passing 'xgb.DMatrix' as input.",
+      " Should be passed as argument to 'xgb.DMatrix' constructor."
+    )
+  }
+
+  use_as_df <- FALSE
+  use_as_dense_matrix <- FALSE
+  use_as_csr_matrix <- FALSE
+  n_row <- NULL
+  if (!is_dmatrix) {
+
+    inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf
+    if (inplace_predict_supported) {
+      booster_type <- xgb.booster_type(object)
+      if (booster_type == "gblinear" || (booster_type == "dart" && training)) {
+        inplace_predict_supported <- FALSE
+      }
+    }
+    if (inplace_predict_supported) {
+
+      if (is.matrix(newdata)) {
+        use_as_dense_matrix <- TRUE
+      } else if (is.data.frame(newdata)) {
+        # note: since here it turns it into a non-data-frame list,
+        # needs to keep track of the number of rows it had for later
+        n_row <- nrow(newdata)
+        newdata <- lapply(
+          newdata,
+          function(x) {
+            if (is.factor(x)) {
+              return(as.numeric(x) - 1)
+            } else {
+              return(as.numeric(x))
+            }
+          }
+        )
+        use_as_df <- TRUE
+      } else if (inherits(newdata, "dgRMatrix")) {
+        use_as_csr_matrix <- TRUE
+        csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata))
+      } else if (inherits(newdata, "dsparseVector")) {
+        use_as_csr_matrix <- TRUE
+        n_row <- 1L
+        i <- newdata@i - 1L
+        if (storage.mode(i) != "integer") {
+          storage.mode(i) <- "integer"
+        }
+        csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata))
+      }
+
+    }
+
+  } # if (!is_dmatrix)
+
+  if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) {
     nthread <- xgb.nthread(object)
     newdata <- xgb.DMatrix(
       newdata,
-      missing = missing, nthread = NVL(nthread, -1)
+      missing = missing,
+      base_margin = base_margin,
+      nthread = NVL(nthread, -1)
     )
+    is_dmatrix <- TRUE
+  }
+
+  if (is.null(n_row)) {
+    n_row <- nrow(newdata)
   }
 
 
@@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
     args$type <- set_type(6)
   }
 
-  predts <- .Call(
-    XGBoosterPredictFromDMatrix_R,
-    xgb.get.handle(object),
-    newdata,
-    jsonlite::toJSON(args, auto_unbox = TRUE)
-  )
+  json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE)
+  if (is_dmatrix) {
+    predts <- .Call(
+      XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf
+    )
+  } else if (use_as_dense_matrix) {
+    predts <- .Call(
+      XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+    )
+  } else if (use_as_csr_matrix) {
+    predts <- .Call(
+      XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin
+    )
+  } else if (use_as_df) {
+    predts <- .Call(
+      XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin
+    )
+  }
+
   names(predts) <- c("shape", "results")
   shape <- predts$shape
   arr <- predts$results
 
   n_ret <- length(arr)
-  n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
   }
@@ -970,6 +1071,10 @@ xgb.best_iteration <- function(bst) {
 #' coef(model)
 #' @export
 coef.xgb.Booster <- function(object, ...) {
+  return(.internal.coef.xgb.Booster(object, add_names = TRUE))
+}
+
+.internal.coef.xgb.Booster <- function(object, add_names = TRUE) {
   booster_type <- xgb.booster_type(object)
   if (booster_type != "gblinear") {
     stop("Coefficients are not defined for Booster type ", booster_type)
@@ -988,21 +1093,27 @@ coef.xgb.Booster <- function(object, ...) {
   intercepts <- weights[seq(sep + 1, length(weights))]
   intercepts <- intercepts + as.numeric(base_score)
 
-  feature_names <- xgb.feature_names(object)
-  if (!NROW(feature_names)) {
-    # This mimics the default naming in R which names columns as "V1..N"
-    # when names are needed but not available
-    feature_names <- paste0("V", seq(1L, num_feature))
+  if (add_names) {
+    feature_names <- xgb.feature_names(object)
+    if (!NROW(feature_names)) {
+      # This mimics the default naming in R which names columns as "V1..N"
+      # when names are needed but not available
+      feature_names <- paste0("V", seq(1L, num_feature))
+    }
+    feature_names <- c("(Intercept)", feature_names)
   }
-  feature_names <- c("(Intercept)", feature_names)
   if (n_cols == 1L) {
     out <- c(intercepts, coefs)
-    names(out) <- feature_names
+    if (add_names) {
+      names(out) <- feature_names
+    }
   } else {
     coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE)
     dim(intercepts) <- c(1L, n_cols)
     out <- rbind(intercepts, coefs)
-    row.names(out) <- feature_names
+    if (add_names) {
+      row.names(out) <- feature_names
+    }
     # TODO: if a class names attributes is added,
     # should use those names here.
   }
@@ -1154,12 +1265,9 @@ print.xgb.Booster <- function(x, ...) {
     cat("  ", paste(attr_names, collapse = ", "), "\n")
   }
 
-  if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) {
-    cat('callbacks:\n')
-    lapply(callback.calls(R_attrs$callbacks), function(x) {
-      cat('  ')
-      print(x)
-    })
+  additional_attr <- setdiff(names(R_attrs), .reserved_cb_names)
+  if (NROW(additional_attr)) {
+    cat("callbacks:\n  ", paste(additional_attr, collapse = ", "), "\n")
   }
 
   if (!is.null(R_attrs$evaluation_log)) {
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index ba0686cf9..edbc267c1 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -28,10 +28,27 @@
 #' 'xgb.QuantileDMatrix'.
 #' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
 #' as a single row (only when making predictions from a fitted model).
-#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
-#' supported for xgb.QuantileDMatrix'.
-#' \item Binary files generated by \link{xgb.DMatrix.save},  passed as a path to the file. These are
-#' \bold{not} supported for xgb.QuantileDMatrix'.
+#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
+#' the file, with an optional format specifier.
+#'
+#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
+#'   \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+#'   \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
+#'         `?format=libsvm` at the end of the file path. It will be the default format if not
+#'         otherwise specified.
+#'   \item CSV files (comma-separated values). This format can be specified by adding suffix
+#'         `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
+#'   }
+#'
+#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
+#' it will not look at the extension or file contents to determine that it contains comma-separated values.
+#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
+#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
+#' corresponds to the labels).
+#'
+#' For more information about passing text files as input, see the articles
+#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
+#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
 #' }
 #' @param label Label of the training data. For classification problems, should be passed encoded as
 #' integers with numeration starting at zero.
@@ -81,6 +98,13 @@
 #' @param label_lower_bound Lower bound for survival training.
 #' @param label_upper_bound Upper bound for survival training.
 #' @param feature_weights Set feature weights for column sampling.
+#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
+#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
+#'
+#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
+#' how the file was split beforehand. Defaults to `"row"`.
+#'
+#' This is not used when `data` is not a URI.
 #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
 #' subclass 'xgb.QuantileDMatrix'.
 #'
@@ -117,7 +141,8 @@ xgb.DMatrix <- function(
   qid = NULL,
   label_lower_bound = NULL,
   label_upper_bound = NULL,
-  feature_weights = NULL
+  feature_weights = NULL,
+  data_split_mode = "row"
 ) {
   if (!is.null(group) && !is.null(qid)) {
     stop("Either one of 'group' or 'qid' should be NULL")
@@ -131,7 +156,14 @@ xgb.DMatrix <- function(
       )
     }
     data <- path.expand(data)
-    handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
+    if (data_split_mode == "row") {
+      data_split_mode <- 0L
+    } else if (data_split_mode == "col") {
+      data_split_mode <- 1L
+    } else {
+      stop("Passed invalid 'data_split_mode': ", data_split_mode)
+    }
+    handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
   } else if (is.matrix(data)) {
     handle <- .Call(
       XGDMatrixCreateFromMat_R, data, missing, nthread
diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R
index ef4599d0e..243f43047 100644
--- a/R-package/R/xgb.DMatrix.save.R
+++ b/R-package/R/xgb.DMatrix.save.R
@@ -6,6 +6,7 @@
 #' @param fname the name of the file to write.
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 #' fname <- file.path(tempdir(), "xgb.DMatrix.data")
diff --git a/R-package/R/xgb.config.R b/R-package/R/xgb.config.R
index 3f3a9b1a7..20b8aef90 100644
--- a/R-package/R/xgb.config.R
+++ b/R-package/R/xgb.config.R
@@ -4,7 +4,14 @@
 #' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
 #' values of all global-scope parameters (listed in
 #' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
+#' @details
+#' Note that serialization-related functions might use a globally-configured number of threads,
+#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
+#' accept an `nthread` parameter, but some methods like `readRDS` might get executed before such
+#' a parameter can be supplied.
 #'
+#' The number of OMP threads can in turn be configured for example through an environment variable
+#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`.
 #' @rdname xgbConfig
 #' @title Set and get global configuration
 #' @name xgb.set.config, xgb.get.config
diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R
index baef3bb03..27f8a0975 100644
--- a/R-package/R/xgb.create.features.R
+++ b/R-package/R/xgb.create.features.R
@@ -71,7 +71,6 @@
 #' new.dtest <- xgb.DMatrix(
 #'   data = new.features.test, label = agaricus.test$label, nthread = 2
 #' )
-#' watchlist <- list(train = new.dtrain)
 #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
 #'
 #' # Model accuracy with new features
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 29bddb57f..1cafd7be7 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -27,7 +27,7 @@
 #'        that NA values should be considered as 'missing' by the algorithm.
 #'        Sometimes, 0 or other extreme value might be used to represent missing values.
 #' @param prediction A logical value indicating whether to return the test fold predictions
-#'        from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.
+#'        from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.
 #' @param showsd \code{boolean}, whether to show standard deviation of cross validation
 #' @param metrics, list of evaluation metrics to be used in cross validation,
 #'   when it is not specified, the evaluation metric is chosen according to objective function.
@@ -57,17 +57,17 @@
 #' @param verbose \code{boolean}, print the statistics during the process
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #'        Default is 1 which means all messages are printed. This parameter is passed to the
-#'        \code{\link{cb.print.evaluation}} callback.
+#'        \code{\link{xgb.cb.print.evaluation}} callback.
 #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
 #'        If set to an integer \code{k}, training with a validation set will stop if the performance
 #'        doesn't improve for \code{k} rounds.
-#'        Setting this parameter engages the \code{\link{cb.early.stop}} callback.
+#'        Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.
 #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
 #'        then this parameter must be set as well.
 #'        When it is \code{TRUE}, it means the larger the evaluation score the better.
-#'        This parameter is passed to the \code{\link{cb.early.stop}} callback.
+#'        This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.
 #' @param callbacks a list of callback functions to perform various task during boosting.
-#'        See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+#'        See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 #'        parameters' values. User can provide either existing or their own callback methods in order
 #'        to customize the training process.
 #' @param ... other parameters to pass to \code{params}.
@@ -90,25 +90,25 @@
 #' \itemize{
 #'   \item \code{call} a function call.
 #'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not
-#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
-#'   \item \code{callbacks} callback functions that were either automatically assigned or
-#'         explicitly passed.
+#'         capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback.
 #'   \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
 #'         first column corresponding to iteration number and the rest corresponding to the
 #'         CV-based evaluation means and standard deviations for the training and test CV-sets.
-#'         It is created by the \code{\link{cb.evaluation.log}} callback.
+#'         It is created by the \code{\link{xgb.cb.evaluation.log}} callback.
 #'   \item \code{niter} number of boosting iterations.
 #'   \item \code{nfeatures} number of features in training data.
 #'   \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
 #'         parameter or randomly generated.
 #'   \item \code{best_iteration} iteration number with the best evaluation metric value
 #'         (only available with early stopping).
-#'   \item \code{pred} CV prediction values available when \code{prediction} is set.
-#'         It is either vector or matrix (see \code{\link{cb.cv.predict}}).
-#'   \item \code{models} a list of the CV folds' models. It is only available with the explicit
-#'         setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
 #' }
 #'
+#' Plus other potential elements that are the result of callbacks, such as a list `cv_predict` with
+#' a sub-element `pred` when passing `prediction = TRUE`, which is added by the \link{xgb.cb.cv.predict}
+#' callback (note that one can also pass it manually under `callbacks` with different settings,
+#' such as saving also the models created during cross validation); or a list `early_stop` which
+#' will contain elements such as `best_iteration` when using the early stopping callback (\link{xgb.cb.early.stop}).
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
@@ -160,32 +160,38 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
     folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params)
   }
 
+  # Callbacks
+  tmp <- .process.callbacks(callbacks, is_cv = TRUE)
+  callbacks <- tmp$callbacks
+  cb_names <- tmp$cb_names
+  rm(tmp)
+
+  # Early stopping callback
+  if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) {
+    callbacks <- add.callback(
+      callbacks,
+      xgb.cb.early.stop(
+        early_stopping_rounds,
+        maximize = maximize,
+        verbose = verbose
+      ),
+      as_first_elt = TRUE
+    )
+  }
   # verbosity & evaluation printing callback:
   params <- c(params, list(silent = 1))
   print_every_n <- max(as.integer(print_every_n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
-    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
+  if (verbose && !("print_evaluation" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n, showsd = showsd))
   }
   # evaluation log callback: always is on in CV
-  evaluation_log <- list()
-  if (!has.callbacks(callbacks, 'cb.evaluation.log')) {
-    callbacks <- add.cb(callbacks, cb.evaluation.log())
-  }
-  # Early stopping callback
-  stop_condition <- FALSE
-  if (!is.null(early_stopping_rounds) &&
-      !has.callbacks(callbacks, 'cb.early.stop')) {
-    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
-                                                 maximize = maximize, verbose = verbose))
+  if (!("evaluation_log" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.evaluation.log())
   }
   # CV-predictions callback
-  if (prediction &&
-      !has.callbacks(callbacks, 'cb.cv.predict')) {
-    callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
+  if (prediction && !("cv_predict" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.cv.predict(save_models = FALSE))
   }
-  # Sort the callbacks into categories
-  cb <- categorize.callbacks(callbacks)
-
 
   # create the booster-folds
   # train_folds
@@ -209,11 +215,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
       modelfile = NULL
     )
     bst <- bst$bst
-    list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
+    list(dtrain = dtrain, bst = bst, evals = list(train = dtrain, test = dtest), index = folds[[k]])
   })
-  rm(dall)
-  # a "basket" to collect some results from callbacks
-  basket <- list()
 
   # extract parameters that can affect the relationship b/w #trees and #iterations
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
@@ -222,10 +225,25 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
   begin_iteration <- 1
   end_iteration <- nrounds
 
+  .execute.cb.before.training(
+    callbacks,
+    bst_folds,
+    dall,
+    NULL,
+    begin_iteration,
+    end_iteration
+  )
+
   # synchronous CV boosting: run CV folds' models within each iteration
   for (iteration in begin_iteration:end_iteration) {
 
-    for (f in cb$pre_iter) f()
+    .execute.cb.before.iter(
+      callbacks,
+      bst_folds,
+      dall,
+      NULL,
+      iteration
+    )
 
     msg <- lapply(bst_folds, function(fd) {
       xgb.iter.update(
@@ -236,33 +254,42 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
       )
       xgb.iter.eval(
         bst = fd$bst,
-        watchlist = fd$watchlist,
+        evals = fd$evals,
         iter = iteration - 1,
         feval = feval
       )
     })
     msg <- simplify2array(msg)
-    # Note: these variables might look unused here, but they are used in the callbacks
-    bst_evaluation <- rowMeans(msg) # nolint
-    bst_evaluation_err <- apply(msg, 1, sd) # nolint
 
-    for (f in cb$post_iter) f()
+    should_stop <- .execute.cb.after.iter(
+      callbacks,
+      bst_folds,
+      dall,
+      NULL,
+      iteration,
+      msg
+    )
 
-    if (stop_condition) break
+    if (should_stop) break
   }
-  for (f in cb$finalize) f(finalize = TRUE)
+  cb_outputs <- .execute.cb.after.training(
+    callbacks,
+    bst_folds,
+    dall,
+    NULL,
+    iteration,
+    msg
+  )
 
   # the CV result
   ret <- list(
     call = match.call(),
     params = params,
-    callbacks = callbacks,
-    evaluation_log = evaluation_log,
-    niter = end_iteration,
-    nfeatures = ncol(data),
+    niter = iteration,
+    nfeatures = ncol(dall),
     folds = folds
   )
-  ret <- c(ret, basket)
+  ret <- c(ret, cb_outputs)
 
   class(ret) <- 'xgb.cv.synchronous'
   return(invisible(ret))
@@ -308,23 +335,16 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
                 paste0('"', unlist(x$params), '"'),
                 sep = ' = ', collapse = ', '), '\n', sep = '')
     }
-    if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
-      cat('callbacks:\n')
-      lapply(callback.calls(x$callbacks), function(x) {
-        cat('  ')
-        print(x)
-      })
-    }
 
     for (n in c('niter', 'best_iteration')) {
-      if (is.null(x[[n]]))
+      if (is.null(x$early_stop[[n]]))
         next
-      cat(n, ': ', x[[n]], '\n', sep = '')
+      cat(n, ': ', x$early_stop[[n]], '\n', sep = '')
     }
 
-    if (!is.null(x$pred)) {
+    if (!is.null(x$cv_predict$pred)) {
       cat('pred:\n')
-      str(x$pred)
+      str(x$cv_predict$pred)
     }
   }
 
@@ -332,9 +352,9 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
     cat('evaluation_log:\n')
   print(x$evaluation_log, row.names = FALSE, ...)
 
-  if (!is.null(x$best_iteration)) {
+  if (!is.null(x$early_stop$best_iteration)) {
     cat('Best iteration:\n')
-    print(x$evaluation_log[x$best_iteration], row.names = FALSE, ...)
+    print(x$evaluation_log[x$early_stop$best_iteration], row.names = FALSE, ...)
   }
   invisible(x)
 }
diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R
index 3a3d2c7dc..2fa5bcb2f 100644
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -24,6 +24,7 @@
 #' as a \code{character} vector. Otherwise it will return \code{TRUE}.
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R
index 7d1eab7e9..d5b192bcb 100644
--- a/R-package/R/xgb.load.R
+++ b/R-package/R/xgb.load.R
@@ -6,7 +6,7 @@
 #'
 #' @details
 #' The input file is expected to contain a model saved in an xgboost model format
-#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some
+#' using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some
 #' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
 #' saved from there in xgboost format, could be loaded from R.
 #'
@@ -20,6 +20,7 @@
 #' \code{\link{xgb.save}}
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R
index e1a61d196..91c545ff7 100644
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -35,6 +35,7 @@
 #' \code{\link{xgb.load}}
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R
index c124a752b..c04f06d9c 100644
--- a/R-package/R/xgb.save.raw.R
+++ b/R-package/R/xgb.save.raw.R
@@ -12,6 +12,7 @@
 #' }
 #'
 #' @examples
+#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #'
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index f0f2332b5..4cea088e0 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -114,13 +114,13 @@
 #' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
 #'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
 #' @param nrounds max number of boosting iterations.
-#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
+#' @param evals Named list of `xgb.DMatrix` datasets to use for evaluating model performance.
 #'        Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
 #'        of these datasets during each boosting iteration, and stored in the end as a field named
 #'        \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
-#'        \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+#'        \code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously
 #'        printed out during the training.
-#'        E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
+#'        E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track
 #'        the performance of each round's model on mat1 and mat2.
 #' @param obj customized objective function. Returns gradient and second order
 #'        gradient with given prediction and dtrain.
@@ -130,31 +130,32 @@
 #' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
 #'        If 2, some additional information will be printed out.
 #'        Note that setting \code{verbose > 0} automatically engages the
-#'        \code{cb.print.evaluation(period=1)} callback function.
+#'        \code{xgb.cb.print.evaluation(period=1)} callback function.
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #'        Default is 1 which means all messages are printed. This parameter is passed to the
-#'        \code{\link{cb.print.evaluation}} callback.
+#'        \code{\link{xgb.cb.print.evaluation}} callback.
 #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
 #'        If set to an integer \code{k}, training with a validation set will stop if the performance
 #'        doesn't improve for \code{k} rounds.
-#'        Setting this parameter engages the \code{\link{cb.early.stop}} callback.
+#'        Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.
 #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
 #'        then this parameter must be set as well.
 #'        When it is \code{TRUE}, it means the larger the evaluation score the better.
-#'        This parameter is passed to the \code{\link{cb.early.stop}} callback.
+#'        This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.
 #' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
-#'        0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.
+#'        0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback.
 #' @param save_name the name or path for periodically saved model file.
 #' @param xgb_model a previously built model to continue the training from.
 #'        Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
 #'        file with a previously saved model.
 #' @param callbacks a list of callback functions to perform various task during boosting.
-#'        See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+#'        See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 #'        parameters' values. User can provide either existing or their own callback methods in order
 #'        to customize the training process.
 #'
-#'        Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
-#'        are kept as R attributes, and thus do not get saved when using non-R serializaters like
+#'        Note that some callbacks might try to leave attributes in the resulting model object,
+#'        such as an evaluation log (a `data.table` object) - be aware that these objects are kept
+#'        as R attributes, and thus do not get saved when using XGBoost's own serializers like
 #'        \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 #' @param ... other parameters to pass to \code{params}.
 #' @param label vector of response values. Should not be provided when data is
@@ -170,7 +171,7 @@
 #' @details
 #' These are the training functions for \code{xgboost}.
 #'
-#' The \code{xgb.train} interface supports advanced features such as \code{watchlist},
+#' The \code{xgb.train} interface supports advanced features such as \code{evals},
 #' customized objective and evaluation metric functions, therefore it is more flexible
 #' than the \code{xgboost} interface.
 #'
@@ -178,6 +179,11 @@
 #' Number of threads can also be manually specified via the \code{nthread}
 #' parameter.
 #'
+#' While in other interfaces the random seed defaults to zero, in R, if the parameter `seed`
+#' is not manually supplied, a random seed is generated through R's own random number generator,
+#' whose seed in turn is controllable through `set.seed`. If `seed` is passed, it will override the
+#' RNG from R.
+#'
 #' The evaluation metric is chosen automatically by XGBoost (according to the objective)
 #' when the \code{eval_metric} parameter is not provided.
 #' User may set one or several \code{eval_metric} parameters.
@@ -201,18 +207,19 @@
 #'
 #' The following callbacks are automatically created when certain parameters are set:
 #' \itemize{
-#'   \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
+#'   \item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0};
 #'         and the \code{print_every_n} parameter is passed to it.
-#'   \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
-#'   \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
-#'   \item \code{cb.save.model}: when \code{save_period > 0} is set.
+#'   \item \code{xgb.cb.evaluation.log} is on when \code{evals} is present.
+#'   \item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set.
+#'   \item \code{xgb.cb.save.model}: when \code{save_period > 0} is set.
 #' }
 #'
 #' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
 #' from typical R objects (it's an 'altrep' list class), and it makes a separation between
 #' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
 #' and shared between interfaces through serialization functions like \link{xgb.save}; and
-#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
+#' R-specific attributes (typically the result from a callback), accessed through \link{attributes}
+#' and \link{attr}, which are otherwise
 #' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
 #' not anyhow used by functions like \link{predict.xgb.Booster}.
 #'
@@ -224,7 +231,7 @@
 #' effect elsewhere.
 #'
 #' @seealso
-#' \code{\link{callbacks}},
+#' \code{\link{xgb.Callback}},
 #' \code{\link{predict.xgb.Booster}},
 #' \code{\link{xgb.cv}}
 #'
@@ -247,12 +254,12 @@
 #' dtest <- with(
 #'   agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
 #' )
-#' watchlist <- list(train = dtrain, eval = dtest)
+#' evals <- list(train = dtrain, eval = dtest)
 #'
 #' ## A simple xgb.train example:
 #' param <- list(max_depth = 2, eta = 1, nthread = nthread,
 #'               objective = "binary:logistic", eval_metric = "auc")
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
 #'
 #' ## An xgb.train example where custom objective and evaluation metric are
 #' ## used:
@@ -273,15 +280,15 @@
 #' #  as 'objective' and 'eval_metric' parameters in the params list:
 #' param <- list(max_depth = 2, eta = 1, nthread = nthread,
 #'               objective = logregobj, eval_metric = evalerror)
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
 #'
 #' #  or through the ... arguments:
 #' param <- list(max_depth = 2, eta = 1, nthread = nthread)
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
+#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
 #'                  objective = logregobj, eval_metric = evalerror)
 #'
 #' #  or as dedicated 'obj' and 'feval' parameters of xgb.train:
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals,
 #'                  obj = logregobj, feval = evalerror)
 #'
 #'
@@ -289,11 +296,11 @@
 #' param <- list(max_depth = 2, eta = 1, nthread = nthread,
 #'               objective = "binary:logistic", eval_metric = "auc")
 #' my_etas <- list(eta = c(0.5, 0.1))
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-#'                  callbacks = list(cb.reset.parameters(my_etas)))
+#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+#'                  callbacks = list(xgb.cb.reset.parameters(my_etas)))
 #'
 #' ## Early stopping:
-#' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+#' bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
 #'                  early_stopping_rounds = 3)
 #'
 #' ## An 'xgboost' interface example:
@@ -304,7 +311,7 @@
 #'
 #' @rdname xgb.train
 #' @export
-xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
+xgb.train <- function(params = list(), data, nrounds, evals = list(),
                       obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
                       early_stopping_rounds = NULL, maximize = NULL,
                       save_period = NULL, save_name = "xgboost.model",
@@ -317,68 +324,68 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   check.custom.obj()
   check.custom.eval()
 
-  # data & watchlist checks
+  # data & evals checks
   dtrain <- data
   if (!inherits(dtrain, "xgb.DMatrix"))
     stop("second argument dtrain must be xgb.DMatrix")
-  if (length(watchlist) > 0) {
-    if (typeof(watchlist) != "list" ||
-        !all(vapply(watchlist, inherits, logical(1), what = 'xgb.DMatrix')))
-      stop("watchlist must be a list of xgb.DMatrix elements")
-    evnames <- names(watchlist)
+  if (length(evals) > 0) {
+    if (typeof(evals) != "list" ||
+        !all(vapply(evals, inherits, logical(1), what = 'xgb.DMatrix')))
+      stop("'evals' must be a list of xgb.DMatrix elements")
+    evnames <- names(evals)
     if (is.null(evnames) || any(evnames == ""))
-      stop("each element of the watchlist must have a name tag")
+      stop("each element of 'evals' must have a name tag")
   }
   # Handle multiple evaluation metrics given as a list
   for (m in params$eval_metric) {
     params <- c(params, list(eval_metric = m))
   }
 
-  # evaluation printing callback
   params <- c(params)
-  print_every_n <- max(as.integer(print_every_n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print.evaluation') &&
-      verbose) {
-    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
-  }
-  # evaluation log callback:  it is automatically enabled when watchlist is provided
-  evaluation_log <- list()
-  if (!has.callbacks(callbacks, 'cb.evaluation.log') &&
-      length(watchlist) > 0) {
-    callbacks <- add.cb(callbacks, cb.evaluation.log())
-  }
-  # Model saving callback
-  if (!is.null(save_period) &&
-      !has.callbacks(callbacks, 'cb.save.model')) {
-    callbacks <- add.cb(callbacks, cb.save.model(save_period, save_name))
-  }
-  # Early stopping callback
-  stop_condition <- FALSE
-  if (!is.null(early_stopping_rounds) &&
-      !has.callbacks(callbacks, 'cb.early.stop')) {
-    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
-                                                 maximize = maximize, verbose = verbose))
+  params['validate_parameters'] <- TRUE
+  if (!("seed" %in% names(params))) {
+    params[["seed"]] <- sample(.Machine$integer.max, size = 1)
   }
 
-  # Sort the callbacks into categories
-  cb <- categorize.callbacks(callbacks)
-  params['validate_parameters'] <- TRUE
-  if (!is.null(params[['seed']])) {
-    warning("xgb.train: `seed` is ignored in R package.  Use `set.seed()` instead.")
+  # callbacks
+  tmp <- .process.callbacks(callbacks, is_cv = FALSE)
+  callbacks <- tmp$callbacks
+  cb_names <- tmp$cb_names
+  rm(tmp)
+
+  # Early stopping callback (should always come first)
+  if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) {
+    callbacks <- add.callback(
+      callbacks,
+      xgb.cb.early.stop(
+        early_stopping_rounds,
+        maximize = maximize,
+        verbose = verbose
+      ),
+      as_first_elt = TRUE
+    )
+  }
+  # evaluation printing callback
+  print_every_n <- max(as.integer(print_every_n), 1L)
+  if (verbose && !("print_evaluation" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n))
+  }
+  # evaluation log callback:  it is automatically enabled when 'evals' is provided
+  if (length(evals) && !("evaluation_log" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.evaluation.log())
+  }
+  # Model saving callback
+  if (!is.null(save_period) && !("save_model" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.save.model(save_period, save_name))
   }
 
   # The tree updating process would need slightly different handling
   is_update <- NVL(params[['process_type']], '.') == 'update'
 
-  past_evaluation_log <- NULL
-  if (inherits(xgb_model, "xgb.Booster")) {
-    past_evaluation_log <- attributes(xgb_model)$evaluation_log
-  }
-
   # Construct a booster (either a new one or load from xgb_model)
   bst <- xgb.Booster(
     params = params,
-    cachelist = append(watchlist, dtrain),
+    cachelist = append(evals, dtrain),
     modelfile = xgb_model
   )
   niter_init <- bst$niter
@@ -389,11 +396,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
     dtrain
   )
 
-  # extract parameters that can affect the relationship b/w #trees and #iterations
-  # Note: it might look like these aren't used, but they need to be defined in this
-  # environment for the callbacks for work correctly.
-  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
-
   if (is_update && nrounds > niter_init)
     stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
 
@@ -401,57 +403,83 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   begin_iteration <- niter_skip + 1
   end_iteration <- niter_skip + nrounds
 
+  .execute.cb.before.training(
+    callbacks,
+    bst,
+    dtrain,
+    evals,
+    begin_iteration,
+    end_iteration
+  )
+
   # the main loop for boosting iterations
   for (iteration in begin_iteration:end_iteration) {
 
-    for (f in cb$pre_iter) f()
-
-    xgb.iter.update(
-        bst = bst,
-        dtrain = dtrain,
-        iter = iteration - 1,
-        obj = obj
+    .execute.cb.before.iter(
+      callbacks,
+      bst,
+      dtrain,
+      evals,
+      iteration
     )
 
-    if (length(watchlist) > 0) {
-      bst_evaluation <- xgb.iter.eval(  # nolint: object_usage_linter
+    xgb.iter.update(
+      bst = bst,
+      dtrain = dtrain,
+      iter = iteration - 1,
+      obj = obj
+    )
+
+    bst_evaluation <- NULL
+    if (length(evals) > 0) {
+      bst_evaluation <- xgb.iter.eval(
         bst = bst,
-        watchlist = watchlist,
+        evals = evals,
         iter = iteration - 1,
         feval = feval
       )
     }
 
-    for (f in cb$post_iter) f()
+    should_stop <- .execute.cb.after.iter(
+      callbacks,
+      bst,
+      dtrain,
+      evals,
+      iteration,
+      bst_evaluation
+    )
 
-    if (stop_condition) break
+    if (should_stop) break
   }
-  for (f in cb$finalize) f(finalize = TRUE)
 
-  # store the evaluation results
-  keep_evaluation_log <- FALSE
-  if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) {
-    keep_evaluation_log <- TRUE
-    # include the previous compatible history when available
-    if (inherits(xgb_model, 'xgb.Booster') &&
-        !is_update &&
-        !is.null(past_evaluation_log) &&
-        isTRUE(all.equal(colnames(evaluation_log),
-                         colnames(past_evaluation_log)))) {
-      evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log))
-    }
-  }
+  cb_outputs <- .execute.cb.after.training(
+    callbacks,
+    bst,
+    dtrain,
+    evals,
+    iteration,
+    bst_evaluation
+  )
 
   extra_attrs <- list(
     call = match.call(),
-    params = params,
-    callbacks = callbacks
+    params = params
   )
-  if (keep_evaluation_log) {
-    extra_attrs$evaluation_log <- evaluation_log
-  }
+
   curr_attrs <- attributes(bst)
-  attributes(bst) <- c(curr_attrs, extra_attrs)
+  if (NROW(curr_attrs)) {
+    curr_attrs <- curr_attrs[
+      setdiff(
+        names(curr_attrs),
+        c(names(extra_attrs), names(cb_outputs))
+      )
+    ]
+  }
+  curr_attrs <- c(extra_attrs, curr_attrs)
+  if (NROW(cb_outputs)) {
+    curr_attrs <- c(curr_attrs, cb_outputs)
+  }
+  attributes(bst) <- curr_attrs
 
   return(bst)
 }
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index 170aa5ffd..a1d373581 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -18,9 +18,9 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
     nthread = merged$nthread
   )
 
-  watchlist <- list(train = dtrain)
+  evals <- list(train = dtrain)
 
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n = print_every_n,
+  bst <- xgb.train(params, dtrain, nrounds, evals, verbose = verbose, print_every_n = print_every_n,
                    early_stopping_rounds = early_stopping_rounds, maximize = maximize,
                    save_period = save_period, save_name = save_name,
                    xgb_model = xgb_model, callbacks = callbacks, ...)
@@ -82,12 +82,8 @@ NULL
 NULL
 
 # Various imports
-#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix
-#' @importFrom Matrix colSums
+#' @importClassesFrom Matrix dgCMatrix dgRMatrix CsparseMatrix
 #' @importFrom Matrix sparse.model.matrix
-#' @importFrom Matrix sparseVector
-#' @importFrom Matrix sparseMatrix
-#' @importFrom Matrix t
 #' @importFrom data.table data.table
 #' @importFrom data.table is.data.table
 #' @importFrom data.table as.data.table
@@ -103,6 +99,7 @@ NULL
 #' @importFrom stats coef
 #' @importFrom stats predict
 #' @importFrom stats median
+#' @importFrom stats sd
 #' @importFrom stats variable.names
 #' @importFrom utils head
 #' @importFrom graphics barplot
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 31f79fb57..9403bac20 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -55,6 +55,8 @@ print(paste("test-error=", err))
 # save model to binary local file
 xgb.save(bst, "xgboost.model")
 # load binary model to R
+# 'xgb.load' doesn't take 'nthreads', but the number of threads can be set like this:
+RhpcBLASctl::omp_set_num_threads(1)
 bst2 <- xgb.load("xgboost.model")
 pred2 <- predict(bst2, test$data)
 # pred2 should be identical to pred
@@ -72,17 +74,17 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred))))
 # to use advanced features, we need to put data in xgb.DMatrix
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 dtest <- xgb.DMatrix(data = test$data, label = test$label)
-#---------------Using watchlist----------------
-# watchlist is a list of xgb.DMatrix, each of them is tagged with name
-watchlist <- list(train = dtrain, test = dtest)
-# to train with watchlist, use xgb.train, which contains more advanced features
-# watchlist allows us to monitor the evaluation result on all data in the list
-print("Train xgboost using xgb.train with watchlist")
-bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
+#---------------Using an evaluation set----------------
+# 'evals' is a list of xgb.DMatrix objects, each of them tagged with a name
+evals <- list(train = dtrain, test = dtest)
+# to train with an evaluation set, use xgb.train, which contains more advanced features
+# 'evals' argument allows us to monitor the evaluation result on all data in the list
+print("Train xgboost using xgb.train with evaluation data")
+bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                  nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
-print("train xgboost using xgb.train with watchlist, watch logloss and error")
-bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
+print("train xgboost using xgb.train with evaluation data, watch logloss and error")
+bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                  eval_metric = "error", eval_metric = "logloss",
                  nthread = 2, objective = "binary:logistic")
 
@@ -90,7 +92,7 @@ bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist =
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
-bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist,
+bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, evals = evals,
                  nthread = 2, objective = "binary:logistic")
 # information can be extracted from xgb.DMatrix using getinfo
 label <- getinfo(dtest, "label")
diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R
index 1a3d55369..75af70dba 100644
--- a/R-package/demo/boost_from_prediction.R
+++ b/R-package/demo/boost_from_prediction.R
@@ -5,14 +5,14 @@ data(agaricus.test, package = 'xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 
-watchlist <- list(eval = dtest, train = dtrain)
+evals <- list(eval = dtest, train = dtrain)
 ###
 # advanced: start from a initial base prediction
 #
 print('start running example to start from a initial prediction')
 # train xgboost for 1 round
 param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic')
-bst <- xgb.train(param, dtrain, 1, watchlist)
+bst <- xgb.train(param, dtrain, 1, evals)
 # Note: we need the margin value instead of transformed prediction in set_base_margin
 # do predict with output_margin=TRUE, will always give you margin values before logistic transformation
 ptrain <- predict(bst, dtrain, outputmargin = TRUE)
@@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain)
 setinfo(dtest, "base_margin", ptest)
 
 print('this is result of boost from initial prediction')
-bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)
+bst <- xgb.train(params = param, data = dtrain, nrounds = 1, evals = evals)
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
index 35201332c..03d7b3464 100644
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -8,7 +8,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-watchlist <- list(eval = dtest, train = dtrain)
+evals <- list(eval = dtest, train = dtrain)
 num_round <- 2
 
 # user define objective function, given prediction, return gradient and second order gradient
@@ -38,7 +38,7 @@ param <- list(max_depth = 2, eta = 1, nthread  =  2, verbosity = 0,
 print('start training with user customized objective')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist)
+bst <- xgb.train(param, dtrain, num_round, evals)
 
 #
 # there can be cases where you want additional information
@@ -62,4 +62,4 @@ param <- list(max_depth = 2, eta = 1, nthread  =  2, verbosity = 0,
 print('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist)
+bst <- xgb.train(param, dtrain, num_round, evals)
diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R
index 04da1382f..057440882 100644
--- a/R-package/demo/early_stopping.R
+++ b/R-package/demo/early_stopping.R
@@ -8,7 +8,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
 param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0)
-watchlist <- list(eval = dtest)
+evals <- list(eval = dtest)
 num_round <- 20
 # user define objective function, given prediction, return gradient and second order gradient
 # this is log likelihood loss
@@ -32,7 +32,7 @@ evalerror <- function(preds, dtrain) {
 }
 print('start training with early Stopping setting')
 
-bst <- xgb.train(param, dtrain, num_round, watchlist,
+bst <- xgb.train(param, dtrain, num_round, evals,
                  objective = logregobj, eval_metric = evalerror, maximize = FALSE,
                  early_stopping_round = 3)
 bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
diff --git a/R-package/demo/generalized_linear_model.R b/R-package/demo/generalized_linear_model.R
index c24fe72cb..d29a6dc5b 100644
--- a/R-package/demo/generalized_linear_model.R
+++ b/R-package/demo/generalized_linear_model.R
@@ -25,9 +25,9 @@ param <- list(objective = "binary:logistic", booster = "gblinear",
 ##
 # the rest of settings are the same
 ##
-watchlist <- list(eval = dtest, train = dtrain)
+evals <- list(eval = dtest, train = dtrain)
 num_round <- 2
-bst <- xgb.train(param, dtrain, num_round, watchlist)
+bst <- xgb.train(param, dtrain, num_round, evals)
 ypred <- predict(bst, dtest)
 labels <- getinfo(dtest, 'label')
 cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n')
diff --git a/R-package/demo/gpu_accelerated.R b/R-package/demo/gpu_accelerated.R
index 14ed9392b..617a63e74 100644
--- a/R-package/demo/gpu_accelerated.R
+++ b/R-package/demo/gpu_accelerated.R
@@ -23,7 +23,7 @@ y <- rbinom(N, 1, plogis(m))
 tr <- sample.int(N, N * 0.75)
 dtrain <- xgb.DMatrix(X[tr, ], label = y[tr])
 dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr])
-wl <- list(train = dtrain, test = dtest)
+evals <- list(train = dtrain, test = dtest)
 
 # An example of running 'gpu_hist' algorithm
 # which is
@@ -35,11 +35,11 @@ wl <- list(train = dtrain, test = dtest)
 param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4,
               max_bin = 64, tree_method = 'gpu_hist')
 pt <- proc.time()
-bst_gpu <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50)
+bst_gpu <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
 proc.time() - pt
 
 # Compare to the 'hist' algorithm:
 param$tree_method <- 'hist'
 pt <- proc.time()
-bst_hist <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50)
+bst_hist <- xgb.train(param, dtrain, evals = evals, nrounds = 50)
 proc.time() - pt
diff --git a/R-package/demo/predict_first_ntree.R b/R-package/demo/predict_first_ntree.R
index 179c18c70..ba15ab39a 100644
--- a/R-package/demo/predict_first_ntree.R
+++ b/R-package/demo/predict_first_ntree.R
@@ -6,11 +6,11 @@ dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 
 param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic')
-watchlist <- list(eval = dtest, train = dtrain)
+evals <- list(eval = dtest, train = dtrain)
 nrounds <- 2
 
 # training the model for two rounds
-bst <- xgb.train(param, dtrain, nrounds, nthread = 2, watchlist)
+bst <- xgb.train(param, dtrain, nrounds, nthread = 2, evals = evals)
 cat('start testing prediction from first n trees\n')
 labels <- getinfo(dtest, 'label')
 
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index 21b6fa71d..a57baf668 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -43,7 +43,6 @@ colnames(new.features.test) <- colnames(new.features.train)
 # learning with new features
 new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
 new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
-watchlist <- list(train = new.dtrain)
 bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
 
 # Model accuracy with new features
diff --git a/R-package/demo/tweedie_regression.R b/R-package/demo/tweedie_regression.R
index dfaf6a2ae..b07858e76 100644
--- a/R-package/demo/tweedie_regression.R
+++ b/R-package/demo/tweedie_regression.R
@@ -39,7 +39,7 @@ bst <- xgb.train(
   data = d_train,
   params = params,
   maximize = FALSE,
-  watchlist = list(train = d_train),
+  evals = list(train = d_train),
   nrounds = 20)
 
 var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst)
diff --git a/R-package/man/callbacks.Rd b/R-package/man/callbacks.Rd
deleted file mode 100644
index 9f6f69015..000000000
--- a/R-package/man/callbacks.Rd
+++ /dev/null
@@ -1,37 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/callbacks.R
-\name{callbacks}
-\alias{callbacks}
-\title{Callback closures for booster training.}
-\description{
-These are used to perform various service tasks either during boosting iterations or at the end.
-This approach helps to modularize many of such tasks without bloating the main training methods,
-and it offers .
-}
-\details{
-By default, a callback function is run after each boosting iteration.
-An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
-
-When a callback function has \code{finalize} parameter, its finalizer part will also be run after
-the boosting is completed.
-
-WARNING: side-effects!!! Be aware that these callback functions access and modify things in
-the environment from which they are called from, which is a fairly uncommon thing to do in R.
-
-To write a custom callback closure, make sure you first understand the main concepts about R environments.
-Check either R documentation on \code{\link[base]{environment}} or the
-\href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R"
-book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks -
-choose ones that do something similar to what you want to achieve. Also, you would need to get familiar
-with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments.
-}
-\seealso{
-\code{\link{cb.print.evaluation}},
-\code{\link{cb.evaluation.log}},
-\code{\link{cb.reset.parameters}},
-\code{\link{cb.early.stop}},
-\code{\link{cb.save.model}},
-\code{\link{cb.cv.predict}},
-\code{\link{xgb.train}},
-\code{\link{xgb.cv}}
-}
diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd
deleted file mode 100644
index 7cd51a3ce..000000000
--- a/R-package/man/cb.early.stop.Rd
+++ /dev/null
@@ -1,62 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/callbacks.R
-\name{cb.early.stop}
-\alias{cb.early.stop}
-\title{Callback closure to activate the early stopping.}
-\usage{
-cb.early.stop(
-  stopping_rounds,
-  maximize = FALSE,
-  metric_name = NULL,
-  verbose = TRUE
-)
-}
-\arguments{
-\item{stopping_rounds}{The number of rounds with no improvement in
-the evaluation metric in order to stop the training.}
-
-\item{maximize}{whether to maximize the evaluation metric}
-
-\item{metric_name}{the name of an evaluation column to use as a criteria for early
-stopping. If not set, the last column would be used.
-Let's say the test data in \code{watchlist} was labelled as \code{dtest},
-and one wants to use the AUC in test data for early stopping regardless of where
-it is in the \code{watchlist}, then one of the following would need to be set:
-\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
-All dash '-' characters in metric names are considered equivalent to '_'.}
-
-\item{verbose}{whether to print the early stopping information.}
-}
-\description{
-Callback closure to activate the early stopping.
-}
-\details{
-This callback function determines the condition for early stopping
-by setting the \code{stop_condition = TRUE} flag in its calling frame.
-
-The following additional fields are assigned to the model's R object:
-\itemize{
-\item \code{best_score} the evaluation score at the best iteration
-\item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-}
-The Same values are also stored as xgb-attributes:
-\itemize{
-\item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
-\item \code{best_msg} message string is also stored.
-}
-
-At least one data element is required in the evaluation watchlist for early stopping to work.
-
-Callback function expects the following values to be set in its calling frame:
-\code{stop_condition},
-\code{bst_evaluation},
-\code{rank},
-\code{bst} (or \code{bst_folds} and \code{basket}),
-\code{iteration},
-\code{begin_iteration},
-\code{end_iteration},
-}
-\seealso{
-\code{\link{callbacks}},
-\code{\link{xgb.attr}}
-}
diff --git a/R-package/man/cb.evaluation.log.Rd b/R-package/man/cb.evaluation.log.Rd
deleted file mode 100644
index 94f8a02e6..000000000
--- a/R-package/man/cb.evaluation.log.Rd
+++ /dev/null
@@ -1,31 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/callbacks.R
-\name{cb.evaluation.log}
-\alias{cb.evaluation.log}
-\title{Callback closure for logging the evaluation history}
-\usage{
-cb.evaluation.log()
-}
-\description{
-Callback closure for logging the evaluation history
-}
-\details{
-This callback function appends the current iteration evaluation results \code{bst_evaluation}
-available in the calling parent frame to the \code{evaluation_log} list in a calling frame.
-
-The finalizer callback (called with \code{finalize = TURE} in the end) converts
-the \code{evaluation_log} list into a final data.table.
-
-The iteration evaluation result \code{bst_evaluation} must be a named numeric vector.
-
-Note: in the column names of the final data.table, the dash '-' character is replaced with
-the underscore '_' in order to make the column names more like regular R identifiers.
-
-Callback function expects the following values to be set in its calling frame:
-\code{evaluation_log},
-\code{bst_evaluation},
-\code{iteration}.
-}
-\seealso{
-\code{\link{callbacks}}
-}
diff --git a/R-package/man/cb.print.evaluation.Rd b/R-package/man/cb.print.evaluation.Rd
deleted file mode 100644
index 59b9ba65e..000000000
--- a/R-package/man/cb.print.evaluation.Rd
+++ /dev/null
@@ -1,29 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/callbacks.R
-\name{cb.print.evaluation}
-\alias{cb.print.evaluation}
-\title{Callback closure for printing the result of evaluation}
-\usage{
-cb.print.evaluation(period = 1, showsd = TRUE)
-}
-\arguments{
-\item{period}{results would be printed every number of periods}
-
-\item{showsd}{whether standard deviations should be printed (when available)}
-}
-\description{
-Callback closure for printing the result of evaluation
-}
-\details{
-The callback function prints the result of evaluation at every \code{period} iterations.
-The initial and the last iteration's evaluations are always printed.
-
-Callback function expects the following values to be set in its calling frame:
-\code{bst_evaluation} (also \code{bst_evaluation_err} when available),
-\code{iteration},
-\code{begin_iteration},
-\code{end_iteration}.
-}
-\seealso{
-\code{\link{callbacks}}
-}
diff --git a/R-package/man/cb.save.model.Rd b/R-package/man/cb.save.model.Rd
deleted file mode 100644
index 7701ad990..000000000
--- a/R-package/man/cb.save.model.Rd
+++ /dev/null
@@ -1,40 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/callbacks.R
-\name{cb.save.model}
-\alias{cb.save.model}
-\title{Callback closure for saving a model file.}
-\usage{
-cb.save.model(save_period = 0, save_name = "xgboost.ubj")
-}
-\arguments{
-\item{save_period}{save the model to disk after every
-\code{save_period} iterations; 0 means save the model at the end.}
-
-\item{save_name}{the name or path for the saved model file.
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that the format of the model being saved is determined by the file
-   extension specified here (see \link{xgb.save} for details about how it works).
-
-   It can contain a \code{\link[base]{sprintf}} formatting specifier
-   to include the integer iteration number in the file name.
-   E.g., with \code{save_name} = 'xgboost_\%04d.ubj',
-   the file saved at iteration 50 would be named "xgboost_0050.ubj".
-}\if{html}{\out{</div>}}}
-}
-\description{
-Callback closure for saving a model file.
-}
-\details{
-This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
-
-Callback function expects the following values to be set in its calling frame:
-\code{bst},
-\code{iteration},
-\code{begin_iteration},
-\code{end_iteration}.
-}
-\seealso{
-\link{xgb.save}
-
-\code{\link{callbacks}}
-}
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 95e7a51fd..88a2f203e 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -18,25 +18,47 @@
   iterationrange = NULL,
   strict_shape = FALSE,
   validate_features = FALSE,
+  base_margin = NULL,
   ...
 )
 }
 \arguments{
 \item{object}{Object of class \code{xgb.Booster}.}
 
-\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
+\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
 local data file, or \code{xgb.DMatrix}.
-For single-row predictions on sparse data, it is recommended to use the CSR format.
-If passing a sparse vector, it will take it as a row vector.}
 
-\item{missing}{Only used when input is a dense matrix. Pick a float value that represents
-missing values in data (e.g., 0 or some other extreme value).}
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   For single-row predictions on sparse data, it's recommended to use CSR format. If passing
+   a sparse vector, it will take it as a row vector.
+
+   Note that, for repeated predictions on the same data, one might want to create a DMatrix to
+   pass here instead of passing R types like matrices or data frames, as predictions will be
+   faster on DMatrix.
+
+   If `newdata` is a `data.frame`, be aware that:\\itemize\{
+   \\item Columns will be converted to numeric if they aren't already, which could potentially make
+         the operation slower than in an equivalent `matrix` object.
+   \\item The order of the columns must match with that of the data from which the model was fitted
+         (i.e. columns will not be referenced by their names, just by their order in the data).
+   \\item If the model was fitted to data with categorical columns, these columns must be of
+         `factor` type here, and must use the same encoding (i.e. have the same levels).
+   \\item If `newdata` contains any `factor` columns, they will be converted to base-0
+         encoding (same as during DMatrix creation) - hence, one should not pass a `factor`
+         under a column which during training had a different type.
+   \}
+}\if{html}{\out{</div>}}}
+
+\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value).
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass
+   this as an argument to the DMatrix constructor instead.
+}\if{html}{\out{</div>}}}
 
 \item{outputmargin}{Whether the prediction should be returned in the form of original untransformed
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would return log-odds instead of probabilities.}
 
-\item{predleaf}{Whether to predict pre-tree leaf indices.}
+\item{predleaf}{Whether to predict per-tree leaf indices.}
 
 \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}
 
@@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.}
 prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib},
 or \code{predinteraction} is \code{TRUE}.}
 
-\item{training}{Whether the predictions are used for training. For dart booster,
+\item{training}{Whether the prediction result is used for training. For dart booster,
 training predicting will perform dropout.}
 
 \item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
@@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n
    recommended to disable it for performance-sensitive applications.
 }\if{html}{\out{</div>}}}
 
+\item{base_margin}{Base margin used for boosting from existing model.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
+   be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
+   an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}).
+}\if{html}{\out{</div>}}}
+
 \item{...}{Not used.}
 }
 \value{
@@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array:
 }
 }
 \description{
-Predicted values based on either xgboost model or model handle object.
+Predict values on data based on xgboost model.
 }
 \details{
 Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
diff --git a/R-package/man/xgb.Callback.Rd b/R-package/man/xgb.Callback.Rd
new file mode 100644
index 000000000..b4edcd978
--- /dev/null
+++ b/R-package/man/xgb.Callback.Rd
@@ -0,0 +1,248 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.Callback}
+\alias{xgb.Callback}
+\title{XGBoost Callback Constructor}
+\usage{
+xgb.Callback(
+  cb_name = "custom_callback",
+  env = new.env(),
+  f_before_training = function(env, model, data, evals, begin_iteration, end_iteration)
+    NULL,
+  f_before_iter = function(env, model, data, evals, iteration) NULL,
+  f_after_iter = function(env, model, data, evals, iteration, iter_feval) NULL,
+  f_after_training = function(env, model, data, evals, iteration, final_feval,
+    prev_cb_res) NULL
+)
+}
+\arguments{
+\item{cb_name}{Name for the callback.
+
+If the callback produces some non-NULL result (from executing the function passed under
+\code{f_after_training}), that result will be added as an R attribute to the resulting booster
+(or as a named element in the result of CV), with the attribute name specified here.
+
+Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.}
+
+\item{env}{An environment object that will be passed to the different functions in the callback.
+Note that this environment will not be shared with other callbacks.}
+
+\item{f_before_training}{A function that will be executed before the training has started.
+
+If passing \code{NULL} for this or for the other function inputs, then no function will be executed.
+
+If passing a function, it will be called with parameters supplied as non-named arguments
+matching the function signatures that are shown in the default value for each function argument.}
+
+\item{f_before_iter}{A function that will be executed before each boosting round.
+
+This function can signal whether the training should be finalized or not, by outputting
+a value that evaluates to \code{TRUE} - i.e. if the output from the function provided here at
+a given round is \code{TRUE}, then training will be stopped before the current iteration happens.
+
+Return values of \code{NULL} will be interpreted as \code{FALSE}.}
+
+\item{f_after_iter}{A function that will be executed after each boosting round.
+
+This function can signal whether the training should be finalized or not, by outputting
+a value that evaluates to \code{TRUE} - i.e. if the output from the function provided here at
+a given round is \code{TRUE}, then training will be stopped at that round.
+
+Return values of \code{NULL} will be interpreted as \code{FALSE}.}
+
+\item{f_after_training}{A function that will be executed after training is finished.
+
+This function can optionally output something non-NULL, which will become part of the R
+attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \link{xgb.train})
+under the name supplied for parameter \code{cb_name} in the case of \link{xgb.train}; or a part
+of the named elements in the result of \link{xgb.cv}.}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+Constructor for defining the structure of callback functions that can be executed
+at different stages of model training (before / after training, before / after each boosting
+iteration).
+}
+\details{
+Arguments that will be passed to the supplied functions are as follows:\itemize{
+
+\item env The same environment that is passed under argument \code{env}.
+
+It may be modified by the functions in order to e.g. keep track of what happens
+across iterations or similar.
+
+This environment is only used by the functions supplied to the callback, and will
+not be kept after the model fitting function terminates (see parameter \code{f_after_training}).
+
+\item model The booster object when using \link{xgb.train}, or the folds when using
+\link{xgb.cv}.
+
+For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
+\item \code{dtrain}: The training data for the fold (as an \code{xgb.DMatrix} object).
+\item \code{bst}: The \code{xgb.Booster} object for the fold.
+\item \code{evals}: A list containing two DMatrices, with names \code{train} and \code{test}
+(\code{test} is the held-out data for the fold).
+\item \code{index}: The indices of the hold-out data for that fold (base-1 indexing),
+from which the \code{test} entry in \code{evals} was obtained.
+}
+
+This object should \bold{not} be in-place modified in ways that conflict with the
+training (e.g. resetting the parameters for a training update in a way that resets
+the number of rounds to zero in order to overwrite rounds).
+
+Note that any R attributes that are assigned to the booster during the callback functions,
+will not be kept thereafter as the booster object variable is not re-assigned during
+training. It is however possible to set C-level attributes of the booster through
+\link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest
+of the iterations and after the training is done.
+
+For keeping variables across iterations, it's recommended to use \code{env} instead.
+\item data The data to which the model is being fit, as an \code{xgb.DMatrix} object.
+
+Note that, for \link{xgb.cv}, this will be the full data, while data for the specific
+folds can be found in the \code{model} object.
+
+\item evals The evaluation data, as passed under argument \code{evals} to
+\link{xgb.train}.
+
+For \link{xgb.cv}, this will always be \code{NULL}.
+
+\item begin_iteration Index of the first boosting iteration that will be executed
+(base-1 indexing).
+
+This will typically be '1', but when using training continuation, depending on the
+parameters for updates, boosting rounds will be continued from where the previous
+model ended, in which case this will be larger than 1.
+
+\item end_iteration Index of the last boosting iteration that will be executed
+(base-1 indexing, inclusive of this end).
+
+It should match with argument \code{nrounds} passed to \link{xgb.train} or \link{xgb.cv}.
+
+Note that boosting might be interrupted before reaching this last iteration, for
+example by using the early stopping callback \link{xgb.cb.early.stop}.
+
+\item iteration Index of the iteration number that is being executed (first iteration
+will be the same as parameter \code{begin_iteration}, then next one will add +1, and so on).
+
+\item iter_feval Evaluation metrics for \code{evals} that were supplied, either
+determined by the objective, or by parameter \code{feval}.
+
+For \link{xgb.train}, this will be a named vector with one entry per element in
+\code{evals}, where the names are determined as 'evals name' + '-' + 'metric name' - for
+example, if \code{evals} contains an entry named "tr" and the metric is "rmse",
+this will be a one-element vector with name "tr-rmse".
+
+For \link{xgb.cv}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]},
+where the row names will follow the same naming logic as the one-dimensional vector
+that is passed in \link{xgb.train}.
+
+Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize
+this table by calculating the row-wise means and standard deviations.
+
+\item final_feval The evaluation results after the last boosting round is executed
+(same format as \code{iter_feval}, and will be the exact same input as passed under
+\code{iter_feval} to the last round that is executed during model fitting).
+
+\item prev_cb_res Result from a previous run of a callback sharing the same name
+(as given by parameter \code{cb_name}) when conducting training continuation, if there
+was any in the booster R attributes.
+
+Sometimes, one might want to append the new results to the previous one, and this will
+be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
+which will append the new rows to the previous table.
+
+If no such previous callback result is available (which it never will when fitting
+a model from start instead of updating an existing model), this will be \code{NULL}.
+
+For \link{xgb.cv}, which doesn't support training continuation, this will always be \code{NULL}.
+}
+
+The following names (\code{cb_name} values) are reserved for internal callbacks:\itemize{
+\item print_evaluation
+\item evaluation_log
+\item reset_parameters
+\item early_stop
+\item save_model
+\item cv_predict
+\item gblinear_history
+}
+
+The following names are reserved for other non-callback attributes:\itemize{
+\item names
+\item class
+\item call
+\item params
+\item niter
+\item nfeatures
+\item folds
+}
+
+When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback
+will always be executed before the others, as it sets some booster C-level attributes
+that other callbacks might also use. Otherwise, the order of execution will match with
+the order in which the callbacks are passed to the model fitting function.
+}
+\examples{
+# Example constructing a custom callback that calculates
+# squared error on the training data (no separate test set),
+# and outputs the per-iteration results.
+ssq_callback <- xgb.Callback(
+  cb_name = "ssq",
+  f_before_training = function(env, model, data, evals,
+                               begin_iteration, end_iteration) {
+    # A vector to keep track of a number at each iteration
+    env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
+  },
+  f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+    # This calculates the sum of squared errors on the training data.
+    # Note that this can be better done by passing an 'evals' entry,
+    # but this demonstrates a way in which callbacks can be structured.
+    pred <- predict(model, data)
+    err <- pred - getinfo(data, "label")
+    sq_err <- sum(err^2)
+    env$logs[iteration] <- sq_err
+    cat(
+      sprintf(
+        "Squared error at iteration \%d: \%.2f\n",
+        iteration, sq_err
+      )
+    )
+
+    # A return value of 'TRUE' here would signal to finalize the training
+    return(FALSE)
+  },
+  f_after_training = function(env, model, data, evals, iteration,
+                              final_feval, prev_cb_res) {
+    return(env$logs)
+  }
+)
+
+data(mtcars)
+y <- mtcars$mpg
+x <- as.matrix(mtcars[, -1])
+dm <- xgb.DMatrix(x, label = y, nthread = 1)
+model <- xgb.train(
+  data = dm,
+  params = list(objective = "reg:squarederror", nthread = 1),
+  nrounds = 5,
+  callbacks = list(ssq_callback),
+  keep_extra_attributes = TRUE
+)
+
+# Result from 'f_after_training' will be available as an attribute
+attributes(model)$ssq
+}
+\seealso{
+Built-in callbacks:\itemize{
+\item \link{xgb.cb.print.evaluation}
+\item \link{xgb.cb.evaluation.log}
+\item \link{xgb.cb.reset.parameters}
+\item \link{xgb.cb.early.stop}
+\item \link{xgb.cb.save.model}
+\item \link{xgb.cb.cv.predict}
+\item \link{xgb.cb.gblinear.history}
+}
+}
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index d18270733..5f764ed45 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -19,7 +19,8 @@ xgb.DMatrix(
   qid = NULL,
   label_lower_bound = NULL,
   label_upper_bound = NULL,
-  feature_weights = NULL
+  feature_weights = NULL,
+  data_split_mode = "row"
 )
 
 xgb.QuantileDMatrix(
@@ -60,10 +61,27 @@ Other column types are not supported.
 'xgb.QuantileDMatrix'.
 \item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
 as a single row (only when making predictions from a fitted model).
-\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
-supported for xgb.QuantileDMatrix'.
-\item Binary files generated by \link{xgb.DMatrix.save},  passed as a path to the file. These are
-\bold{not} supported for xgb.QuantileDMatrix'.
+\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
+the file, with an optional format specifier.
+
+These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
+\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
+\code{?format=libsvm} at the end of the file path. It will be the default format if not
+otherwise specified.
+\item CSV files (comma-separated values). This format can be specified by adding suffix
+\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
+}
+
+Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
+it will not look at the extension or file contents to determine that it is a comma-separated value.
+Instead, the format must be specified following the URI format, so the input to \code{data} should be passed
+like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column
+corresponds to the labels).
+
+For more information about passing text files as input, see the articles
+\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
+\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
 }}
 
 \item{label}{Label of the training data. For classification problems, should be passed encoded as
@@ -129,6 +147,14 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
 
 \item{feature_weights}{Set feature weights for column sampling.}
 
+\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals
+whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}.
+
+In distributed mode, the file is split accordingly; otherwise this is only an indicator on
+how the file was split beforehand. Defaults to \code{"row"}.
+
+This is not used when \code{data} is not a URI.}
+
 \item{ref}{The training dataset that provides quantile information, needed when creating
 validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
 as a reference means that the same quantisation applied to the training data is
diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd
index d5c0563b3..51643274d 100644
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -15,6 +15,7 @@ xgb.DMatrix.save(dmatrix, fname)
 Save xgb.DMatrix object to binary file
 }
 \examples{
+\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 data(agaricus.train, package='xgboost')
 dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
 fname <- file.path(tempdir(), "xgb.DMatrix.data")
diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/xgb.cb.cv.predict.Rd
similarity index 53%
rename from R-package/man/cb.cv.predict.Rd
rename to R-package/man/xgb.cb.cv.predict.Rd
index 4cabac1c9..d2d9a084b 100644
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/xgb.cb.cv.predict.Rd
@@ -1,16 +1,27 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.cv.predict}
-\alias{cb.cv.predict}
-\title{Callback closure for returning cross-validation based predictions.}
+\name{xgb.cb.cv.predict}
+\alias{xgb.cb.cv.predict}
+\title{Callback for returning cross-validation based predictions.}
 \usage{
-cb.cv.predict(save_models = FALSE)
+xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE)
 }
 \arguments{
-\item{save_models}{a flag for whether to save the folds' models.}
+\item{save_models}{A flag for whether to save the folds' models.}
+
+\item{outputmargin}{Whether to save margin predictions (same effect as passing this
+parameter to \link{predict.xgb.Booster}).}
 }
 \value{
-Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
+An \code{xgb.Callback} object, which can be passed to \link{xgb.cv},
+but \bold{not} to \link{xgb.train}.
+}
+\description{
+This callback function saves predictions for all of the test folds,
+and also allows to save the folds' models.
+}
+\details{
+Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix,
 depending on the number of prediction outputs per data row. The order of predictions corresponds
 to the order of rows in the original dataset. Note that when a custom \code{folds} list is
 provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
@@ -19,23 +30,3 @@ meaningful when user-provided folds have overlapping indices as in, e.g., random
 When some of the indices in the training dataset are not included into user-provided \code{folds},
 their prediction value would be \code{NA}.
 }
-\description{
-Callback closure for returning cross-validation based predictions.
-}
-\details{
-This callback function saves predictions for all of the test folds,
-and also allows to save the folds' models.
-
-It is a "finalizer" callback and it uses early stopping information whenever it is available,
-thus it must be run after the early stopping callback if the early stopping is used.
-
-Callback function expects the following values to be set in its calling frame:
-\code{bst_folds},
-\code{basket},
-\code{data},
-\code{end_iteration},
-\code{params},
-}
-\seealso{
-\code{\link{callbacks}}
-}
diff --git a/R-package/man/xgb.cb.early.stop.Rd b/R-package/man/xgb.cb.early.stop.Rd
new file mode 100644
index 000000000..2a70f4943
--- /dev/null
+++ b/R-package/man/xgb.cb.early.stop.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.early.stop}
+\alias{xgb.cb.early.stop}
+\title{Callback to activate early stopping}
+\usage{
+xgb.cb.early.stop(
+  stopping_rounds,
+  maximize = FALSE,
+  metric_name = NULL,
+  verbose = TRUE,
+  keep_all_iter = TRUE
+)
+}
+\arguments{
+\item{stopping_rounds}{The number of rounds with no improvement in
+the evaluation metric in order to stop the training.}
+
+\item{maximize}{Whether to maximize the evaluation metric.}
+
+\item{metric_name}{The name of an evaluation column to use as a criterion for early
+stopping. If not set, the last column would be used.
+Let's say the test data in \code{evals} was labelled as \code{dtest},
+and one wants to use the AUC in test data for early stopping regardless of where
+it is in the \code{evals}, then one of the following would need to be set:
+\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
+All dash '-' characters in metric names are considered equivalent to '_'.}
+
+\item{verbose}{Whether to print the early stopping information.}
+
+\item{keep_all_iter}{Whether to keep all of the boosting rounds that were produced
+in the resulting object. If passing \code{FALSE}, will only keep the boosting rounds
+up to the detected best iteration, discarding the ones that come after.}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+This callback function determines the condition for early stopping.
+
+The following attributes are assigned to the booster's object:
+\itemize{
+\item \code{best_score} the evaluation score at the best iteration
+\item \code{best_iteration} at which boosting iteration the best score has occurred
+(0-based index for interoperability of binary models)
+}
+
+The same values are also stored as R attributes as a result of the callback, plus an additional
+attribute \code{stopped_by_max_rounds} which indicates whether an early stopping by the \code{stopping_rounds}
+condition occurred. Note that the \code{best_iteration} that is stored under R attributes will follow
+base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
+through \link{xgb.attr} or \link{xgb.attributes}.
+
+At least one dataset is required in \code{evals} for early stopping to work.
+}
diff --git a/R-package/man/xgb.cb.evaluation.log.Rd b/R-package/man/xgb.cb.evaluation.log.Rd
new file mode 100644
index 000000000..4cc6ef636
--- /dev/null
+++ b/R-package/man/xgb.cb.evaluation.log.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.evaluation.log}
+\alias{xgb.cb.evaluation.log}
+\title{Callback for logging the evaluation history}
+\usage{
+xgb.cb.evaluation.log()
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+Callback for logging the evaluation history
+}
+\details{
+This callback creates a table with per-iteration evaluation metrics (see parameters
+\code{evals} and \code{feval} in \link{xgb.train}).
+
+Note: in the column names of the final data.table, the dash '-' character is replaced with
+the underscore '_' in order to make the column names more like regular R identifiers.
+}
+\seealso{
+\link{xgb.cb.print.evaluation}
+}
diff --git a/R-package/man/cb.gblinear.history.Rd b/R-package/man/xgb.cb.gblinear.history.Rd
similarity index 63%
rename from R-package/man/cb.gblinear.history.Rd
rename to R-package/man/xgb.cb.gblinear.history.Rd
index 2a03c14db..0ebaa4685 100644
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/xgb.cb.gblinear.history.Rd
@@ -1,37 +1,48 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.gblinear.history}
-\alias{cb.gblinear.history}
-\title{Callback closure for collecting the model coefficients history of a gblinear booster
-during its training.}
+\name{xgb.cb.gblinear.history}
+\alias{xgb.cb.gblinear.history}
+\title{Callback for collecting coefficients history of a gblinear booster}
 \usage{
-cb.gblinear.history(sparse = FALSE)
+xgb.cb.gblinear.history(sparse = FALSE)
 }
 \arguments{
-\item{sparse}{when set to FALSE/TRUE, a dense/sparse matrix is used to store the result.
+\item{sparse}{when set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
 Sparse format is useful when one expects only a subset of coefficients to be non-zero,
 when using the "thrifty" feature selector with fairly small number of top features
 selected per iteration.}
 }
 \value{
-Results are stored in the \code{coefs} element of the closure.
-The \code{\link{xgb.gblinear.history}} convenience function provides an easy
-way to access it.
-With \code{xgb.train}, it is either a dense of a sparse matrix.
-While with \code{xgb.cv}, it is a list (an element per each fold) of such
-matrices.
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
 }
 \description{
-Callback closure for collecting the model coefficients history of a gblinear booster
-during its training.
+Callback for collecting coefficients history of a gblinear booster
 }
 \details{
 To keep things fast and simple, gblinear booster does not internally store the history of linear
 model coefficients at each boosting iteration. This callback provides a workaround for storing
 the coefficients' path, by extracting them after each training iteration.
 
-Callback function expects the following values to be set in its calling frame:
-\code{bst} (or \code{bst_folds}).
+This callback will construct a matrix where rows are boosting iterations and columns are
+feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept
+corresponding to the first column).
+
+When there is more than one coefficient per feature (e.g. multi-class classification),
+the result will be reshaped into a vector where coefficients are arranged first by features and
+then by class (e.g. first 1 through N coefficients will be for the first class, then
+coefficients N+1 through 2N for the second class, and so on).
+
+If the result has only one coefficient per feature in the data, then the resulting matrix
+will have column names matching with the feature names, otherwise (when there's more than
+one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
+(so e.g. column 'c1' for class '0' will be named 'c1:0').
+
+With \code{xgb.train}, the output is either a dense or a sparse matrix.
+With \code{xgb.cv}, it is a list (one element per fold) of such
+matrices.
+
+The \link{xgb.gblinear.history} function provides an easy way to retrieve the
+outputs from this callback.
 }
 \examples{
 #### Binary classification:
@@ -52,7 +63,7 @@ param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "a
 # rate does not break the convergence, but allows us to illustrate the typical pattern of
 # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 # Extract the coefficients' path and plot them vs boosting iteration number:
 coef_path <- xgb.gblinear.history(bst)
 matplot(coef_path, type = 'l')
@@ -61,7 +72,7 @@ matplot(coef_path, type = 'l')
 # Will try the classical componentwise boosting which selects a single best feature per round:
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 matplot(xgb.gblinear.history(bst), type = 'l')
 #  Componentwise boosting is known to have similar effect to Lasso regularization.
 # Try experimenting with various values of top_k, eta, nrounds,
@@ -69,7 +80,7 @@ matplot(xgb.gblinear.history(bst), type = 'l')
 
 # For xgb.cv:
 bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
-              callbacks = list(cb.gblinear.history()))
+              callbacks = list(xgb.cb.gblinear.history()))
 # coefficients in the CV fold #3
 matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
 
@@ -82,7 +93,7 @@ param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
 # For the default linear updater 'shotgun' it sometimes is helpful
 # to use smaller eta to reduce instability
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 # Will plot the coefficient paths separately for each class:
 matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
 matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l')
@@ -90,11 +101,11 @@ matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l')
 
 # CV:
 bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
-              callbacks = list(cb.gblinear.history(FALSE)))
+              callbacks = list(xgb.cb.gblinear.history(FALSE)))
 # 1st fold of 1st class
 matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
 
 }
 \seealso{
-\code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+\link{xgb.gblinear.history}, \link{coef.xgb.Booster}.
 }
diff --git a/R-package/man/xgb.cb.print.evaluation.Rd b/R-package/man/xgb.cb.print.evaluation.Rd
new file mode 100644
index 000000000..c4f2e6991
--- /dev/null
+++ b/R-package/man/xgb.cb.print.evaluation.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.print.evaluation}
+\alias{xgb.cb.print.evaluation}
+\title{Callback for printing the result of evaluation}
+\usage{
+xgb.cb.print.evaluation(period = 1, showsd = TRUE)
+}
+\arguments{
+\item{period}{Results will be printed every \code{period} iterations.}
+
+\item{showsd}{whether standard deviations should be printed (when available)}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+The callback function prints the result of evaluation at every \code{period} iterations.
+The initial and the last iteration's evaluations are always printed.
+
+Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that).
+}
+\seealso{
+\link{xgb.Callback}
+}
diff --git a/R-package/man/cb.reset.parameters.Rd b/R-package/man/xgb.cb.reset.parameters.Rd
similarity index 57%
rename from R-package/man/cb.reset.parameters.Rd
rename to R-package/man/xgb.cb.reset.parameters.Rd
index ee0a5d1bd..c7e863817 100644
--- a/R-package/man/cb.reset.parameters.Rd
+++ b/R-package/man/xgb.cb.reset.parameters.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.reset.parameters}
-\alias{cb.reset.parameters}
-\title{Callback closure for resetting the booster's parameters at each iteration.}
+\name{xgb.cb.reset.parameters}
+\alias{xgb.cb.reset.parameters}
+\title{Callback for resetting the booster's parameters at each iteration.}
 \usage{
-cb.reset.parameters(new_params)
+xgb.cb.reset.parameters(new_params)
 }
 \arguments{
 \item{new_params}{a list where each element corresponds to a parameter that needs to be reset.
@@ -14,23 +14,16 @@ or a function of two parameters \code{learning_rates(iteration, nrounds)}
 which returns a new parameter value by using the current iteration number
 and the total number of boosting rounds.}
 }
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
 \description{
-Callback closure for resetting the booster's parameters at each iteration.
+Callback for resetting the booster's parameters at each iteration.
 }
 \details{
-This is a "pre-iteration" callback function used to reset booster's parameters
-at the beginning of each iteration.
-
 Note that when training is resumed from some previous model, and a function is used to
 reset a parameter value, the \code{nrounds} argument in this function would be the
 the number of boosting rounds in the current training.
 
-Callback function expects the following values to be set in its calling frame:
-\code{bst} or \code{bst_folds},
-\code{iteration},
-\code{begin_iteration},
-\code{end_iteration}.
-}
-\seealso{
-\code{\link{callbacks}}
+Does not leave any attribute in the booster.
 }
diff --git a/R-package/man/xgb.cb.save.model.Rd b/R-package/man/xgb.cb.save.model.Rd
new file mode 100644
index 000000000..8ddba2f1a
--- /dev/null
+++ b/R-package/man/xgb.cb.save.model.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.save.model}
+\alias{xgb.cb.save.model}
+\title{Callback for saving a model file.}
+\usage{
+xgb.cb.save.model(save_period = 0, save_name = "xgboost.ubj")
+}
+\arguments{
+\item{save_period}{Save the model to disk after every
+\code{save_period} iterations; 0 means save the model at the end.}
+
+\item{save_name}{The name or path for the saved model file.
+It can contain a \code{\link[base]{sprintf}} formatting specifier
+to include the integer iteration number in the file name.
+E.g., with \code{save_name} = 'xgboost_\%04d.model',
+the file saved at iteration 50 would be named "xgboost_0050.model".}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train},
+but \bold{not} to \link{xgb.cv}.
+}
+\description{
+This callback function allows saving an xgb-model file, either periodically
+after every \code{save_period} iterations or at the end.
+
+Does not leave any attribute in the booster.
+}
diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd
index 68b561997..995c27459 100644
--- a/R-package/man/xgb.create.features.Rd
+++ b/R-package/man/xgb.create.features.Rd
@@ -82,7 +82,6 @@ new.dtrain <- xgb.DMatrix(
 new.dtest <- xgb.DMatrix(
   data = new.features.test, label = agaricus.test$label, nthread = 2
 )
-watchlist <- list(train = new.dtrain)
 bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2)
 
 # Model accuracy with new features
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 9f6103a52..778b4540a 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -59,7 +59,7 @@ that NA values should be considered as 'missing' by the algorithm.
 Sometimes, 0 or other extreme value might be used to represent missing values.}
 
 \item{prediction}{A logical value indicating whether to return the test fold predictions
-from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.}
+from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.}
 
 \item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
 
@@ -98,20 +98,20 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
 
 \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
 Default is 1 which means all messages are printed. This parameter is passed to the
-\code{\link{cb.print.evaluation}} callback.}
+\code{\link{xgb.cb.print.evaluation}} callback.}
 
 \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 doesn't improve for \code{k} rounds.
-Setting this parameter engages the \code{\link{cb.early.stop}} callback.}
+Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
 then this parameter must be set as well.
 When it is \code{TRUE}, it means the larger the evaluation score the better.
-This parameter is passed to the \code{\link{cb.early.stop}} callback.}
+This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{callbacks}{a list of callback functions to perform various task during boosting.
-See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 parameters' values. User can provide either existing or their own callback methods in order
 to customize the training process.}
 
@@ -122,24 +122,24 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 \itemize{
 \item \code{call} a function call.
 \item \code{params} parameters that were passed to the xgboost library. Note that it does not
-capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
-\item \code{callbacks} callback functions that were either automatically assigned or
-explicitly passed.
+capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback.
 \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
 first column corresponding to iteration number and the rest corresponding to the
 CV-based evaluation means and standard deviations for the training and test CV-sets.
-It is created by the \code{\link{cb.evaluation.log}} callback.
+It is created by the \code{\link{xgb.cb.evaluation.log}} callback.
 \item \code{niter} number of boosting iterations.
 \item \code{nfeatures} number of features in training data.
 \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{pred} CV prediction values available when \code{prediction} is set.
-It is either vector or matrix (see \code{\link{cb.cv.predict}}).
-\item \code{models} a list of the CV folds' models. It is only available with the explicit
-setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
 }
+
+Plus other potential elements that are the result of callbacks, such as a list \code{cv_predict} with
+a sub-element \code{pred} when passing \code{prediction = TRUE}, which is added by the \link{xgb.cb.cv.predict}
+callback (note that one can also pass it manually under \code{callbacks} with different settings,
+such as saving also the models created during cross validation); or a list \code{early_stop} which
+will contain elements such as \code{best_iteration} when using the early stopping callback (\link{xgb.cb.early.stop}).
 }
 \description{
 The cross validation function of xgboost
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
index 2cdb6b16a..6f97f6924 100644
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -44,6 +44,7 @@ as a \code{character} vector. Otherwise it will return \code{TRUE}.
 Dump an xgboost model in text format.
 }
 \examples{
+\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
diff --git a/R-package/man/xgb.gblinear.history.Rd b/R-package/man/xgb.gblinear.history.Rd
index 103be16f1..25aef7163 100644
--- a/R-package/man/xgb.gblinear.history.Rd
+++ b/R-package/man/xgb.gblinear.history.Rd
@@ -8,7 +8,7 @@ xgb.gblinear.history(model, class_index = NULL)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
-using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
+using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster
 loaded from \link{xgb.load} or \link{xgb.load.raw}.}
 
 \item{class_index}{zero-based class index to extract the coefficients for only that
@@ -16,23 +16,31 @@ specific class in a multinomial multiclass model. When it is NULL, all the
 coefficients are returned. Has no effect in non-multiclass models.}
 }
 \value{
-For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns
-corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would
-return) and the rows corresponding to boosting iterations.
+For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns
+corresponding to iteration's coefficients and the rows corresponding to boosting iterations.
 
-For an \code{xgb.cv} result, a list of such matrices is returned with the elements
+For an \link{xgb.cv} result, a list of such matrices is returned with the elements
 corresponding to CV folds.
+
+When there is more than one coefficient per feature (e.g. multi-class classification)
+and \code{class_index} is not provided,
+the result will be reshaped into a vector where coefficients are arranged first by features and
+then by class (e.g. first 1 through N coefficients will be for the first class, then
+coefficients N+1 through 2N for the second class, and so on).
 }
 \description{
 A helper function to extract the matrix of linear coefficients' history
-from a gblinear model created while using the \code{cb.gblinear.history()}
-callback.
+from a gblinear model created while using the \link{xgb.cb.gblinear.history}
+callback (which must be added manually as by default it's not used).
 }
 \details{
 Note that this is an R-specific function that relies on R attributes that
 are not saved when using xgboost's own serialization functions like \link{xgb.load}
 or \link{xgb.load.raw}.
 
-In order for a serialized model to be accepted by tgis function, one must use R
+In order for a serialized model to be accepted by this function, one must use R
 serializers such as \link{saveRDS}.
 }
+\seealso{
+\link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}.
+}
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index 1a6873171..e18a900e3 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -17,7 +17,7 @@ Load xgboost model from the binary model file.
 }
 \details{
 The input file is expected to contain a model saved in an xgboost model format
-using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some
+using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some
 appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
 saved from there in xgboost format, could be loaded from R.
 
@@ -25,6 +25,7 @@ Note: a model saved as an R-object, has to be loaded using corresponding R-metho
 not \code{xgb.load}.
 }
 \examples{
+\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index 0db80a120..bcfbd0bb4 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -41,6 +41,7 @@ how to persist models in a future-proof way, i.e. to make the model accessible i
 releases of XGBoost.
 }
 \examples{
+\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 
diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd
index 15400bb14..6cdafd3d9 100644
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -21,6 +21,7 @@ xgb.save.raw(model, raw_format = "ubj")
 Save xgboost model from xgboost or xgb.train
 }
 \examples{
+\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 0421b9c4a..21c8dbe16 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -9,7 +9,7 @@ xgb.train(
   params = list(),
   data,
   nrounds,
-  watchlist = list(),
+  evals = list(),
   obj = NULL,
   feval = NULL,
   verbose = 1,
@@ -158,13 +158,13 @@ List is provided in detail section.}
 
 \item{nrounds}{max number of boosting iterations.}
 
-\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
+\item{evals}{Named list of \code{xgb.DMatrix} datasets to use for evaluating model performance.
 Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
 of these datasets during each boosting iteration, and stored in the end as a field named
 \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
-\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+\code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously
 printed out during the training.
-E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
+E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track
 the performance of each round's model on mat1 and mat2.}
 
 \item{obj}{customized objective function. Returns gradient and second order
@@ -177,24 +177,24 @@ prediction and dtrain.}
 \item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance.
 If 2, some additional information will be printed out.
 Note that setting \code{verbose > 0} automatically engages the
-\code{cb.print.evaluation(period=1)} callback function.}
+\code{xgb.cb.print.evaluation(period=1)} callback function.}
 
 \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
 Default is 1 which means all messages are printed. This parameter is passed to the
-\code{\link{cb.print.evaluation}} callback.}
+\code{\link{xgb.cb.print.evaluation}} callback.}
 
 \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k}, training with a validation set will stop if the performance
 doesn't improve for \code{k} rounds.
-Setting this parameter engages the \code{\link{cb.early.stop}} callback.}
+Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
 then this parameter must be set as well.
 When it is \code{TRUE}, it means the larger the evaluation score the better.
-This parameter is passed to the \code{\link{cb.early.stop}} callback.}
+This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{save_period}{when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
-0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.}
+0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback.}
 
 \item{save_name}{the name or path for periodically saved model file.}
 
@@ -203,12 +203,13 @@ Could be either an object of class \code{xgb.Booster}, or its raw data, or the n
 file with a previously saved model.}
 
 \item{callbacks}{a list of callback functions to perform various task during boosting.
-See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 parameters' values. User can provide either existing or their own callback methods in order
 to customize the training process.
 
-\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
-   are kept as R attributes, and thus do not get saved when using non-R serializaters like
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that some callbacks might try to leave attributes in the resulting model object,
+   such as an evaluation log (a `data.table` object) - be aware that these objects are kept
+   as R attributes, and thus do not get saved when using XGBoost's own serializers like
    \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 }\if{html}{\out{</div>}}}
 
@@ -233,7 +234,7 @@ The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
 \details{
 These are the training functions for \code{xgboost}.
 
-The \code{xgb.train} interface supports advanced features such as \code{watchlist},
+The \code{xgb.train} interface supports advanced features such as \code{evals},
 customized objective and evaluation metric functions, therefore it is more flexible
 than the \code{xgboost} interface.
 
@@ -241,6 +242,11 @@ Parallelization is automatically enabled if \code{OpenMP} is present.
 Number of threads can also be manually specified via the \code{nthread}
 parameter.
 
+While in other interfaces the random seed defaults to zero, in R, if a parameter \code{seed}
+is not manually supplied, it will generate a random seed through R's own random number generator,
+whose seed in turn is controllable through \code{set.seed}. If \code{seed} is passed, it will override the
+RNG from R.
+
 The evaluation metric is chosen automatically by XGBoost (according to the objective)
 when the \code{eval_metric} parameter is not provided.
 User may set one or several \code{eval_metric} parameters.
@@ -264,18 +270,19 @@ Different threshold (e.g., 0.) could be specified as "error@0."
 
 The following callbacks are automatically created when certain parameters are set:
 \itemize{
-\item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
+\item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0};
 and the \code{print_every_n} parameter is passed to it.
-\item \code{cb.evaluation.log} is on when \code{watchlist} is present.
-\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
-\item \code{cb.save.model}: when \code{save_period > 0} is set.
+\item \code{xgb.cb.evaluation.log} is on when \code{evals} is present.
+\item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set.
+\item \code{xgb.cb.save.model}: when \code{save_period > 0} is set.
 }
 
 Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently
 from typical R objects (it's an 'altrep' list class), and it makes a separation between
 internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
 and shared between interfaces through serialization functions like \link{xgb.save}; and
-R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
+R-specific attributes (typically the result from a callback), accessed through \link{attributes}
+and \link{attr}, which are otherwise
 only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
 not anyhow used by functions like \link{predict.xgb.Booster}.
 
@@ -300,12 +307,12 @@ dtrain <- with(
 dtest <- with(
   agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
 )
-watchlist <- list(train = dtrain, eval = dtest)
+evals <- list(train = dtrain, eval = dtest)
 
 ## A simple xgb.train example:
 param <- list(max_depth = 2, eta = 1, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
+bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
 
 ## An xgb.train example where custom objective and evaluation metric are
 ## used:
@@ -326,15 +333,15 @@ evalerror <- function(preds, dtrain) {
 #  as 'objective' and 'eval_metric' parameters in the params list:
 param <- list(max_depth = 2, eta = 1, nthread = nthread,
               objective = logregobj, eval_metric = evalerror)
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
+bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
 
 #  or through the ... arguments:
 param <- list(max_depth = 2, eta = 1, nthread = nthread)
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
+bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
                  objective = logregobj, eval_metric = evalerror)
 
 #  or as dedicated 'obj' and 'feval' parameters of xgb.train:
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals,
                  obj = logregobj, feval = evalerror)
 
 
@@ -342,11 +349,11 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 param <- list(max_depth = 2, eta = 1, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
 my_etas <- list(eta = c(0.5, 0.1))
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                 callbacks = list(cb.reset.parameters(my_etas)))
+bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                 callbacks = list(xgb.cb.reset.parameters(my_etas)))
 
 ## Early stopping:
-bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
                  early_stopping_rounds = 3)
 
 ## An 'xgboost' interface example:
@@ -361,7 +368,7 @@ Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
 }
 \seealso{
-\code{\link{callbacks}},
+\code{\link{xgb.Callback}},
 \code{\link{predict.xgb.Booster}},
 \code{\link{xgb.cv}}
 }
diff --git a/R-package/man/xgbConfig.Rd b/R-package/man/xgbConfig.Rd
index 94b220c77..164c62ef4 100644
--- a/R-package/man/xgbConfig.Rd
+++ b/R-package/man/xgbConfig.Rd
@@ -25,6 +25,15 @@ values of one or more global-scope parameters. Use \code{xgb.get.config} to fetc
 values of all global-scope parameters (listed in
 \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
 }
+\details{
+Note that serialization-related functions might use a globally-configured number of threads,
+which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
+accept an \code{nthreads} parameter, but some methods like \code{readRDS} might get executed before such
+parameter can be supplied.
+
+The number of OMP threads can in turn be configured for example through an environment variable
+\code{OMP_NUM_THREADS} (needs to be set before R is started), or through \code{RhpcBLASctl::omp_set_num_threads}.
+}
 \examples{
 # Set verbosity level to silent (0)
 xgb.set.config(verbosity = 0)
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index dd13983f5..0f4b3ac6f 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 46a862711..0c2084de9 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \
diff --git a/R-package/src/init.c b/R-package/src/init.c
index a9f3f3e38..c869871c6 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value);
 extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle);
 extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw);
 extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
 extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
@@ -46,7 +49,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
 extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
 extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
@@ -73,6 +76,7 @@ extern SEXP XGBSetGlobalConfig_R(SEXP);
 extern SEXP XGBGetGlobalConfig_R(void);
 extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
 extern SEXP XGBoosterSlice_R(SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterSliceAndReplace_R(SEXP, SEXP, SEXP, SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
   {"XGDuplicate_R",               (DL_FUNC) &XGDuplicate_R,               1},
@@ -96,6 +100,9 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterSerializeToBuffer_R",     (DL_FUNC) &XGBoosterSerializeToBuffer_R,     1},
   {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2},
   {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3},
+  {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5},
+  {"XGBoosterPredictFromCSR_R",   (DL_FUNC) &XGBoosterPredictFromCSR_R,   5},
+  {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5},
   {"XGBoosterSaveModel_R",        (DL_FUNC) &XGBoosterSaveModel_R,        2},
   {"XGBoosterSetAttr_R",          (DL_FUNC) &XGBoosterSetAttr_R,          3},
   {"XGBoosterSetParam_R",         (DL_FUNC) &XGBoosterSetParam_R,         3},
@@ -105,7 +112,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
   {"XGDMatrixCreateFromCSC_R",    (DL_FUNC) &XGDMatrixCreateFromCSC_R,    6},
   {"XGDMatrixCreateFromCSR_R",    (DL_FUNC) &XGDMatrixCreateFromCSR_R,    6},
-  {"XGDMatrixCreateFromFile_R",   (DL_FUNC) &XGDMatrixCreateFromFile_R,   2},
+  {"XGDMatrixCreateFromURI_R",    (DL_FUNC) &XGDMatrixCreateFromURI_R,    3},
   {"XGDMatrixCreateFromMat_R",    (DL_FUNC) &XGDMatrixCreateFromMat_R,    3},
   {"XGDMatrixGetFloatInfo_R",     (DL_FUNC) &XGDMatrixGetFloatInfo_R,     2},
   {"XGDMatrixGetUIntInfo_R",      (DL_FUNC) &XGDMatrixGetUIntInfo_R,      2},
@@ -132,6 +139,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBGetGlobalConfig_R",        (DL_FUNC) &XGBGetGlobalConfig_R,        0},
   {"XGBoosterFeatureScore_R",     (DL_FUNC) &XGBoosterFeatureScore_R,     2},
   {"XGBoosterSlice_R",            (DL_FUNC) &XGBoosterSlice_R,            4},
+  {"XGBoosterSliceAndReplace_R",  (DL_FUNC) &XGBoosterSliceAndReplace_R,  4},
   {NULL, NULL, 0}
 };
 
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index c91fb94c4..2228932bd 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -13,6 +13,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
+#include <memory>
 #include <limits>
 #include <sstream>
 #include <string>
@@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
   return xgboost::Json::Dump(jinterface);
 }
 
-[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
-  using namespace ::xgboost;  // NOLINT
-  Json jconfig{Object{}};
-
-  const SEXPTYPE missing_type = TYPEOF(missing);
-  if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
-      (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
-      (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
+void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) {
+  if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) {
     // missing is not specified
     if (arr_type == REALSXP) {
-      jconfig["missing"] = std::numeric_limits<double>::quiet_NaN();
+      (*jconfig)["missing"] = std::numeric_limits<double>::quiet_NaN();
     } else {
-      jconfig["missing"] = R_NaInt;
+      (*jconfig)["missing"] = R_NaInt;
     }
   } else {
     // missing specified
-    jconfig["missing"] = Rf_asReal(missing);
+    (*jconfig)["missing"] = Rf_asReal(missing);
   }
+}
 
+[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
+  using namespace ::xgboost;  // NOLINT
+  Json jconfig{Object{}};
+  AddMissingToJson(&jconfig, missing, arr_type);
   jconfig["nthread"] = Rf_asInteger(n_threads);
   return Json::Dump(jconfig);
 }
@@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() {
   return mkString(json_str);
 }
 
-XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
-  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) {
+  SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+  SEXP uri_char = Rf_protect(Rf_asChar(uri));
+  const char *uri_ptr = CHAR(uri_char);
   R_API_BEGIN();
+  xgboost::Json jconfig{xgboost::Object{}};
+  jconfig["uri"] = std::string(uri_ptr);
+  jconfig["silent"] = Rf_asLogical(silent);
+  jconfig["data_split_mode"] = Rf_asInteger(data_split_mode);
+  const std::string sconfig = xgboost::Json::Dump(jconfig);
   DMatrixHandle handle;
-  CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
+  CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle));
   R_SetExternalPtrAddr(ret, handle);
   R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
   R_API_END();
-  UNPROTECT(1);
+  Rf_unprotect(2);
   return ret;
 }
 
@@ -404,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
   DMatrixHandle handle;
   std::int32_t rc{0};
   {
-    std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
+    const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
     xgboost::Json jconfig{xgboost::Object{}};
     jconfig["missing"] = asReal(missing);
     jconfig["nthread"] = asInteger(n_threads);
@@ -456,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP
     Json jconfig{Object{}};
     // Construct configuration
     jconfig["nthread"] = Integer{threads};
-    jconfig["missing"] = xgboost::Number{asReal(missing)};
+    AddMissingToJson(&jconfig, missing, TYPEOF(data));
     std::string config;
     Json::Dump(jconfig, &config);
     res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
@@ -491,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
     Json jconfig{Object{}};
     // Construct configuration
     jconfig["nthread"] = Integer{threads};
-    jconfig["missing"] = xgboost::Number{asReal(missing)};
+    AddMissingToJson(&jconfig, missing, TYPEOF(data));
     std::string config;
     Json::Dump(jconfig, &config);
     res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
@@ -1240,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
   return mkString(ret);
 }
 
-XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config)  {
+namespace {
+
+struct ProxyDmatrixError : public std::exception {};
+
+struct ProxyDmatrixWrapper {
+  DMatrixHandle proxy_dmat_handle;
+
+  ProxyDmatrixWrapper() {
+    int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle);
+    if (res_code != 0) {
+      throw ProxyDmatrixError();
+    }
+  }
+
+  ~ProxyDmatrixWrapper() {
+    if (this->proxy_dmat_handle) {
+      XGDMatrixFree(this->proxy_dmat_handle);
+      this->proxy_dmat_handle = nullptr;
+    }
+  }
+
+  DMatrixHandle get_handle() {
+    return this->proxy_dmat_handle;
+  }
+};
+
+std::unique_ptr<ProxyDmatrixWrapper> GetProxyDMatrixWithBaseMargin(SEXP base_margin) {
+  if (Rf_isNull(base_margin)) {
+    return std::unique_ptr<ProxyDmatrixWrapper>(nullptr);
+  }
+
+  SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol);
+  int res_code;
+  try {
+    const std::string array_str = Rf_isNull(base_margin_dim)?
+      MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin);
+    std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat(new ProxyDmatrixWrapper());
+    res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(),
+                                             "base_margin",
+                                             array_str.c_str());
+    if (res_code != 0) {
+      throw ProxyDmatrixError();
+    }
+    return proxy_dmat;
+  } catch(ProxyDmatrixError &err) {
+    Rf_error("%s", XGBGetLastError());
+  }
+}
+
+enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame};
+
+SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config,
+                                    PredictionInputType input_type, SEXP missing,
+                                    SEXP base_margin) {
   SEXP r_out_shape;
   SEXP r_out_result;
   SEXP r_out = PROTECT(allocVector(VECSXP, 2));
@@ -1252,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
   bst_ulong out_dim;
   bst_ulong const *out_shape;
   float const *out_result;
-  CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
-                                         R_ExternalPtrAddr(dmat), c_json_config,
-                                         &out_shape, &out_dim, &out_result));
+
+  int res_code;
+  {
+    switch (input_type) {
+      case PredictionInputType::DMatrix: {
+        res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
+                                               R_ExternalPtrAddr(input_data), c_json_config,
+                                               &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::CSRMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
+        SEXP indptr = VECTOR_ELT(input_data, 0);
+        SEXP indices = VECTOR_ELT(input_data, 1);
+        SEXP data = VECTOR_ELT(input_data, 2);
+        const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3));
+        const SEXPTYPE type_data = TYPEOF(data);
+        CHECK_EQ(type_data, REALSXP);
+        std::string sindptr, sindices, sdata;
+        CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, type_data);
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromCSR(
+          R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(),
+          ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::DenseMatrix: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+        const std::string array_str = MakeArrayInterfaceFromRMat(input_data);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, TYPEOF(input_data));
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromDense(
+          R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(),
+          proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+
+      case PredictionInputType::DataFrame: {
+        std::unique_ptr<ProxyDmatrixWrapper> proxy_dmat = GetProxyDMatrixWithBaseMargin(
+          base_margin);
+        DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr;
+
+        const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data);
+
+        xgboost::StringView json_str(c_json_config);
+        xgboost::Json new_json = xgboost::Json::Load(json_str);
+        AddMissingToJson(&new_json, missing, REALSXP);
+        const std::string new_c_json = xgboost::Json::Dump(new_json);
+
+        res_code = XGBoosterPredictFromColumnar(
+          R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(),
+          proxy_dmat_handle, &out_shape, &out_dim, &out_result);
+        break;
+      }
+    }
+  }
+  CHECK_CALL(res_code);
 
   r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
   size_t len = 1;
@@ -1275,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con
   return r_out;
 }
 
+}  // namespace
+
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config)  {
+  return XGBoosterPredictGeneric(handle, dmat, json_config,
+                                 PredictionInputType::DMatrix, R_NilValue, R_NilValue);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+                                         SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, R_mat, json_config,
+                                 PredictionInputType::DenseMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+                                       SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, lst, json_config,
+                                 PredictionInputType::CSRMatrix, missing, base_margin);
+}
+
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+                                            SEXP json_config, SEXP base_margin) {
+  return XGBoosterPredictGeneric(handle, R_df, json_config,
+                                 PredictionInputType::DataFrame, missing, base_margin);
+}
+
 XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
@@ -1519,3 +1674,18 @@ XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEX
   Rf_unprotect(1);
   return out;
 }
+
+XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step) {
+  R_API_BEGIN();
+  BoosterHandle old_handle = R_ExternalPtrAddr(handle);
+  BoosterHandle new_handle = nullptr;
+  CHECK_CALL(XGBoosterSlice(old_handle,
+                            Rf_asInteger(begin_layer),
+                            Rf_asInteger(end_layer),
+                            Rf_asInteger(step),
+                            &new_handle));
+  R_SetExternalPtrAddr(handle, new_handle);
+  CHECK_CALL(XGBoosterFree(old_handle));
+  R_API_END();
+  return R_NilValue;
+}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index d2e0ae828..cea50c146 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
 XGB_DLL SEXP XGBGetGlobalConfig_R();
 
 /*!
- * \brief load a data matrix
- * \param fname name of the content
+ * \brief load a data matrix from URI
+ * \param uri URI to the source file to read data from
  * \param silent whether print messages
+ * \param data_split_mode Data split mode (0=rows, 1=columns)
  * \return a loaded data matrix
  */
-XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
+XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode);
 
 /*!
  * \brief create matrix content from dense matrix
@@ -370,6 +371,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
  * \return A list containing 2 vectors, first one for shape while second one for prediction result.
  */
 XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
+
+/*!
+ * \brief Run prediction on R dense matrix
+ * \param handle handle
+ * \param R_mat R matrix
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing,
+                                         SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R CSR matrix
+ * \param handle handle
+ * \param lst An R list, containing, in this order:
+ *              (a) 'p' array (a.k.a. indptr)
+ *              (b) 'j' array (a.k.a. indices)
+ *              (c) 'x' array (a.k.a. data / values)
+ *              (d) number of columns
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing,
+                                       SEXP json_config, SEXP base_margin);
+
+/*!
+ * \brief Run prediction on R data.frame
+ * \param handle handle
+ * \param R_df R data.frame
+ * \param missing missing value
+ * \param json_config See `XGBoosterPredictFromColumnar` in c_api.h. Doesn't include 'missing'
+ * \param base_margin base margin for the prediction
+ *
+ * \return A list containing 2 vectors, first one for shape while second one for prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing,
+                                            SEXP json_config, SEXP base_margin);
+
 /*!
  * \brief load model from existing file
  * \param handle handle
@@ -490,4 +535,14 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config);
  */
 XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
 
+/*!
+ * \brief Slice a fitted booster model (by rounds), and replace its handle with the result
+ * \param handle handle to the fitted booster
+ * \param begin_layer start of the slice
+ * \param end_layer end of the slice; end_layer=0 is equivalent to end_layer=num_boost_round
+ * \param step step size of the slice
+ * \return NULL
+ */
+XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
+
 #endif  // XGBOOST_WRAPPER_R_H_ // NOLINT(*)
diff --git a/R-package/src/xgboost_custom.cc b/R-package/src/xgboost_custom.cc
index 4b05361ca..fb548c61d 100644
--- a/R-package/src/xgboost_custom.cc
+++ b/R-package/src/xgboost_custom.cc
@@ -41,16 +41,6 @@ double LogGamma(double v) {
   return lgammafn(v);
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
-// customize random engine.
-void CustomGlobalRandomEngine::seed(CustomGlobalRandomEngine::result_type val) {
-  // ignore the seed
-}
 
-// use R's PRNG to replacd
-CustomGlobalRandomEngine::result_type
-CustomGlobalRandomEngine::operator()() {
-  return static_cast<result_type>(
-      std::floor(unif_rand() * CustomGlobalRandomEngine::max()));
-}
 }  // namespace common
 }  // namespace xgboost
diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R
index 3ae44f6b1..7a621798a 100644
--- a/R-package/tests/helper_scripts/install_deps.R
+++ b/R-package/tests/helper_scripts/install_deps.R
@@ -20,6 +20,7 @@ pkgs <- c(
   "igraph",
   "float",
   "titanic",
+  "RhpcBLASctl",
   ## imports
   "Matrix",
   "methods",
diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R
index 3bb229e70..bad6c1df3 100644
--- a/R-package/tests/testthat.R
+++ b/R-package/tests/testthat.R
@@ -1,4 +1,6 @@
 library(testthat)
 library(xgboost)
+library(Matrix)
 
 test_check("xgboost", reporter = ProgressReporter)
+RhpcBLASctl::omp_set_num_threads(1)
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 03a8ddbe1..18a3b99e6 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -20,7 +20,7 @@ test_that("train and predict binary classification", {
       data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
       eta = 1, nthread = n_threads, nrounds = nrounds,
       objective = "binary:logistic", eval_metric = "error",
-      watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
+      evals = list(train = xgb.DMatrix(train$data, label = train$label))
     ),
     "train-error"
   )
@@ -139,8 +139,8 @@ test_that("dart prediction works", {
   pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds))
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
 
-  expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
-  expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
+  expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6)
+  expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6)
   expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
 })
 
@@ -152,7 +152,7 @@ test_that("train and predict softprob", {
       data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
       max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
       objective = "multi:softprob", num_class = 3, eval_metric = "merror",
-      watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
+      evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
     ),
     "train-merror"
   )
@@ -203,7 +203,7 @@ test_that("train and predict softmax", {
       data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
       max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
       objective = "multi:softmax", num_class = 3, eval_metric = "merror",
-      watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
+      evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
     ),
     "train-merror"
   )
@@ -226,7 +226,7 @@ test_that("train and predict RF", {
     nthread = n_threads,
     nrounds = 1, objective = "binary:logistic", eval_metric = "error",
     num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1,
-    watchlist = list(train = xgb.DMatrix(train$data, label = lb))
+    evals = list(train = xgb.DMatrix(train$data, label = lb))
   )
   expect_equal(xgb.get.num.boosted.rounds(bst), 1)
 
@@ -250,7 +250,7 @@ test_that("train and predict RF with softprob", {
     objective = "multi:softprob", eval_metric = "merror",
     num_class = 3, verbose = 0,
     num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5,
-    watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
+    evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
   )
   expect_equal(xgb.get.num.boosted.rounds(bst), 15)
   # predict for all iterations:
@@ -271,7 +271,7 @@ test_that("use of multiple eval metrics works", {
       data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
       eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
       eval_metric = "error", eval_metric = "auc", eval_metric = "logloss",
-      watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
+      evals = list(train = xgb.DMatrix(train$data, label = train$label))
     ),
     "train-error.*train-auc.*train-logloss"
   )
@@ -283,7 +283,7 @@ test_that("use of multiple eval metrics works", {
       data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
       eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
       eval_metric = list("error", "auc", "logloss"),
-      watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
+      evals = list(train = xgb.DMatrix(train$data, label = train$label))
     ),
     "train-error.*train-auc.*train-logloss"
   )
@@ -295,19 +295,19 @@ test_that("use of multiple eval metrics works", {
 
 test_that("training continuation works", {
   dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
-  watchlist <- list(train = dtrain)
+  evals <- list(train = dtrain)
   param <- list(
     objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads
   )
 
   # for the reference, use 4 iterations at once:
   set.seed(11)
-  bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
+  bst <- xgb.train(param, dtrain, nrounds = 4, evals = evals, verbose = 0)
   # first two iterations:
   set.seed(11)
-  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
+  bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0)
   # continue for two more:
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = bst1)
   if (!windows_flag && !solaris_flag) {
     expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
   }
@@ -315,7 +315,7 @@ test_that("training continuation works", {
   expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2))
   expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log)
   # test continuing from raw model data
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1))
   if (!windows_flag && !solaris_flag) {
     expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
   }
@@ -323,7 +323,7 @@ test_that("training continuation works", {
   # test continuing from a model in file
   fname <- file.path(tempdir(), "xgboost.json")
   xgb.save(bst1, fname)
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = fname)
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = fname)
   if (!windows_flag && !solaris_flag) {
     expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
   }
@@ -348,7 +348,6 @@ test_that("xgb.cv works", {
   expect_false(is.null(cv$folds) && is.list(cv$folds))
   expect_length(cv$folds, 5)
   expect_false(is.null(cv$params) && is.list(cv$params))
-  expect_false(is.null(cv$callbacks))
   expect_false(is.null(cv$call))
 })
 
@@ -418,7 +417,7 @@ test_that("max_delta_step works", {
   dtrain <- xgb.DMatrix(
     agaricus.train$data, label = agaricus.train$label, nthread = n_threads
   )
-  watchlist <- list(train = dtrain)
+  evals <- list(train = dtrain)
   param <- list(
     objective = "binary:logistic", eval_metric = "logloss", max_depth = 2,
     nthread = n_threads,
@@ -426,9 +425,9 @@ test_that("max_delta_step works", {
   )
   nrounds <- 5
   # model with no restriction on max_delta_step
-  bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
+  bst1 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1)
   # model with restricted max_delta_step
-  bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
+  bst2 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1, max_delta_step = 1)
   # the no-restriction model is expected to have consistently lower loss during the initial iterations
   expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss))
   expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8)
@@ -445,7 +444,7 @@ test_that("colsample_bytree works", {
   colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
   dtrain <- xgb.DMatrix(train_x, label = train_y, nthread = n_threads)
   dtest <- xgb.DMatrix(test_x, label = test_y, nthread = n_threads)
-  watchlist <- list(train = dtrain, eval = dtest)
+  evals <- list(train = dtrain, eval = dtest)
   ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for
   ## each tree
   param <- list(
@@ -454,7 +453,7 @@ test_that("colsample_bytree works", {
     eval_metric = "auc"
   )
   set.seed(2)
-  bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
+  bst <- xgb.train(param, dtrain, nrounds = 100, evals = evals, verbose = 0)
   xgb.importance(model = bst)
   # If colsample_bytree works properly, a variety of features should be used
   # in the 100 trees
@@ -651,6 +650,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", {
   expect_equal(pred_qid, pred_gr)
 })
 
+test_that("Can predict on data.frame objects", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x_df <- mtcars[, -1]
+  x_mat <- as.matrix(x_df)
+  dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads)
+  model <- xgb.train(
+    params = list(
+      tree_method = "hist",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    data = dm,
+    nrounds = 5
+  )
+
+  pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads)
+  pred_df <- predict(model, x_df, nthread = n_threads)
+  expect_equal(pred_mat, pred_df)
+})
+
+test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", {
+  data("mtcars")
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1])
+  dm <- xgb.DMatrix(x, label = y, nthread = n_threads)
+  model <- xgb.train(
+    params = list(
+      tree_method = "hist",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    data = dm,
+    nrounds = 5
+  )
+
+  set.seed(123)
+  base_margin <- rnorm(nrow(x))
+  dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin)
+  pred_from_dm <- predict(model, dm_w_base)
+  pred_from_mat <- predict(model, x, base_margin = base_margin)
+
+  expect_equal(pred_from_dm, pred_from_mat)
+})
+
 test_that("Coefficients from gblinear have the expected shape and names", {
   # Single-column coefficients
   data(mtcars)
@@ -778,3 +822,66 @@ test_that("DMatrix field are set to booster when training", {
   expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q"))
   expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q"))
 })
+
+test_that("Seed in params override PRNG from R", {
+  set.seed(123)
+  model1 <- xgb.train(
+    data = xgb.DMatrix(
+      agaricus.train$data,
+      label = agaricus.train$label, nthread = 1L
+    ),
+    params = list(
+      objective = "binary:logistic",
+      max_depth = 3L,
+      subsample = 0.1,
+      colsample_bytree = 0.1,
+      seed = 111L
+    ),
+    nrounds = 3L
+  )
+
+  set.seed(456)
+  model2 <- xgb.train(
+    data = xgb.DMatrix(
+      agaricus.train$data,
+      label = agaricus.train$label, nthread = 1L
+    ),
+    params = list(
+      objective = "binary:logistic",
+      max_depth = 3L,
+      subsample = 0.1,
+      colsample_bytree = 0.1,
+      seed = 111L
+    ),
+    nrounds = 3L
+  )
+
+  expect_equal(
+    xgb.save.raw(model1, raw_format = "json"),
+    xgb.save.raw(model2, raw_format = "json")
+  )
+
+  set.seed(123)
+  model3 <- xgb.train(
+    data = xgb.DMatrix(
+      agaricus.train$data,
+      label = agaricus.train$label, nthread = 1L
+    ),
+    params = list(
+      objective = "binary:logistic",
+      max_depth = 3L,
+      subsample = 0.1,
+      colsample_bytree = 0.1,
+      seed = 222L
+    ),
+    nrounds = 3L
+  )
+  expect_false(
+    isTRUE(
+      all.equal(
+        xgb.save.raw(model1, raw_format = "json"),
+        xgb.save.raw(model3, raw_format = "json")
+      )
+    )
+  )
+})
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index c60d0c246..913791de4 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -19,7 +19,7 @@ ltrain <- add.noise(train$label, 0.2)
 ltest <- add.noise(test$label, 0.2)
 dtrain <- xgb.DMatrix(train$data, label = ltrain, nthread = n_threads)
 dtest <- xgb.DMatrix(test$data, label = ltest, nthread = n_threads)
-watchlist <- list(train = dtrain, test = dtest)
+evals <- list(train = dtrain, test = dtest)
 
 
 err <- function(label, pr) sum((pr > 0.5) != label) / length(label)
@@ -28,79 +28,125 @@ param <- list(objective = "binary:logistic", eval_metric = "error",
               max_depth = 2, nthread = n_threads)
 
 
-test_that("cb.print.evaluation works as expected", {
+test_that("xgb.cb.print.evaluation works as expected for xgb.train", {
+  logs1 <- capture.output({
+    model <- xgb.train(
+      data = dtrain,
+      params = list(
+        objective = "binary:logistic",
+        eval_metric = "auc",
+        max_depth = 2,
+        nthread = n_threads
+      ),
+      nrounds = 10,
+      evals = list(train = dtrain, test = dtest),
+      callbacks = list(xgb.cb.print.evaluation(period = 1))
+    )
+  })
+  expect_equal(length(logs1), 10)
+  expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs1)))
+  lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x])))
 
-  bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8)
-  bst_evaluation_err <- NULL
-  begin_iteration <- 1
-  end_iteration <- 7
-
-  f0 <- cb.print.evaluation(period = 0)
-  f1 <- cb.print.evaluation(period = 1)
-  f5 <- cb.print.evaluation(period = 5)
-
-  expect_false(is.null(attr(f1, 'call')))
-  expect_equal(attr(f1, 'name'), 'cb.print.evaluation')
-
-  iteration <- 1
-  expect_silent(f0())
-  expect_output(f1(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-  expect_output(f5(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-  expect_null(f1())
-
-  iteration <- 2
-  expect_output(f1(), "\\[2\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-  expect_silent(f5())
-
-  iteration <- 7
-  expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-  expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-
-  bst_evaluation_err  <- c('train-auc' = 0.1, 'test-auc' = 0.2)
-  expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000±0.100000\ttest-auc:0.800000±0.200000")
+  logs2 <- capture.output({
+    model <- xgb.train(
+      data = dtrain,
+      params = list(
+        objective = "binary:logistic",
+        eval_metric = "auc",
+        max_depth = 2,
+        nthread = n_threads
+      ),
+      nrounds = 10,
+      evals = list(train = dtrain, test = dtest),
+      callbacks = list(xgb.cb.print.evaluation(period = 2))
+    )
+  })
+  expect_equal(length(logs2), 6)
+  expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs2)))
+  seq_matches <- c(seq(1, 10, 2), 10)
+  lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x])))
 })
 
-test_that("cb.evaluation.log works as expected", {
+test_that("xgb.cb.print.evaluation works as expected for xgb.cv", {
+  logs1 <- capture.output({
+    model <- xgb.cv(
+      data = dtrain,
+      params = list(
+        objective = "binary:logistic",
+        eval_metric = "auc",
+        max_depth = 2,
+        nthread = n_threads
+      ),
+      nrounds = 10,
+      nfold = 3,
+      callbacks = list(xgb.cb.print.evaluation(period = 1, showsd = TRUE))
+    )
+  })
+  expect_equal(length(logs1), 10)
+  expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs1)))
+  lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x])))
 
-  bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8)
-  bst_evaluation_err <- NULL
+  logs2 <- capture.output({
+    model <- xgb.cv(
+      data = dtrain,
+      params = list(
+        objective = "binary:logistic",
+        eval_metric = "auc",
+        max_depth = 2,
+        nthread = n_threads
+      ),
+      nrounds = 10,
+      nfold = 3,
+      callbacks = list(xgb.cb.print.evaluation(period = 2, showsd = TRUE))
+    )
+  })
+  expect_equal(length(logs2), 6)
+  expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs2)))
+  seq_matches <- c(seq(1, 10, 2), 10)
+  lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x])))
+})
 
-  evaluation_log <- list()
-  f <- cb.evaluation.log()
+test_that("xgb.cb.evaluation.log works as expected for xgb.train", {
+  model <- xgb.train(
+    data = dtrain,
+    params = list(
+      objective = "binary:logistic",
+      eval_metric = "auc",
+      max_depth = 2,
+      nthread = n_threads
+    ),
+    nrounds = 10,
+    verbose = FALSE,
+    evals = list(train = dtrain, test = dtest),
+    callbacks = list(xgb.cb.evaluation.log())
+  )
+  logs <- attributes(model)$evaluation_log
 
-  expect_false(is.null(attr(f, 'call')))
-  expect_equal(attr(f, 'name'), 'cb.evaluation.log')
+  expect_equal(nrow(logs), 10)
+  expect_equal(colnames(logs), c("iter", "train_auc", "test_auc"))
+})
 
-  iteration <- 1
-  expect_silent(f())
-  expect_equal(evaluation_log,
-               list(c(iter = 1, bst_evaluation)))
-  iteration <- 2
-  expect_silent(f())
-  expect_equal(evaluation_log,
-               list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation)))
-  expect_silent(f(finalize = TRUE))
-  expect_equal(evaluation_log,
-               data.table::data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8)))
+test_that("xgb.cb.evaluation.log works as expected for xgb.cv", {
+  model <- xgb.cv(
+    data = dtrain,
+    params = list(
+      objective = "binary:logistic",
+      eval_metric = "auc",
+      max_depth = 2,
+      nthread = n_threads
+    ),
+    nrounds = 10,
+    verbose = FALSE,
+    nfold = 3,
+    callbacks = list(xgb.cb.evaluation.log())
+  )
+  logs <- model$evaluation_log
 
-  bst_evaluation_err  <- c('train-auc' = 0.1, 'test-auc' = 0.2)
-  evaluation_log <- list()
-  f <- cb.evaluation.log()
-
-  iteration <- 1
-  expect_silent(f())
-  expect_equal(evaluation_log,
-               list(c(iter = 1, c(bst_evaluation, bst_evaluation_err))))
-  iteration <- 2
-  expect_silent(f())
-  expect_equal(evaluation_log,
-               list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)),
-                    c(iter = 2, c(bst_evaluation, bst_evaluation_err))))
-  expect_silent(f(finalize = TRUE))
-  expect_equal(evaluation_log,
-               data.table::data.table(iter = 1:2,
-                          train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1),
-                          test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2)))
+  expect_equal(nrow(logs), 10)
+  expect_equal(
+    colnames(logs),
+    c("iter", "train_auc_mean", "train_auc_std", "test_auc_mean", "test_auc_std")
+  )
 })
 
 
@@ -109,26 +155,26 @@ param <- list(objective = "binary:logistic", eval_metric = "error",
 
 test_that("can store evaluation_log without printing", {
   expect_silent(
-    bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
+    bst <- xgb.train(param, dtrain, nrounds = 10, evals = evals, eta = 1, verbose = 0)
   )
   expect_false(is.null(attributes(bst)$evaluation_log))
   expect_false(is.null(attributes(bst)$evaluation_log$train_error))
   expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2)
 })
 
-test_that("cb.reset.parameters works as expected", {
+test_that("xgb.cb.reset.parameters works as expected", {
 
   # fixed eta
   set.seed(111)
-  bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
+  bst0 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 0.9, verbose = 0)
   expect_false(is.null(attributes(bst0)$evaluation_log))
   expect_false(is.null(attributes(bst0)$evaluation_log$train_error))
 
   # same eta but re-set as a vector parameter in the callback
   set.seed(111)
   my_par <- list(eta = c(0.9, 0.9))
-  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                    callbacks = list(cb.reset.parameters(my_par)))
+  bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                    callbacks = list(xgb.cb.reset.parameters(my_par)))
   expect_false(is.null(attributes(bst1)$evaluation_log$train_error))
   expect_equal(attributes(bst0)$evaluation_log$train_error,
                attributes(bst1)$evaluation_log$train_error)
@@ -136,8 +182,8 @@ test_that("cb.reset.parameters works as expected", {
   # same eta but re-set via a function in the callback
   set.seed(111)
   my_par <- list(eta = function(itr, itr_end) 0.9)
-  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                    callbacks = list(cb.reset.parameters(my_par)))
+  bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                    callbacks = list(xgb.cb.reset.parameters(my_par)))
   expect_false(is.null(attributes(bst2)$evaluation_log$train_error))
   expect_equal(attributes(bst0)$evaluation_log$train_error,
                attributes(bst2)$evaluation_log$train_error)
@@ -145,39 +191,39 @@ test_that("cb.reset.parameters works as expected", {
   # different eta re-set as a vector parameter in the callback
   set.seed(111)
   my_par <- list(eta = c(0.6, 0.5))
-  bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                    callbacks = list(cb.reset.parameters(my_par)))
+  bst3 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                    callbacks = list(xgb.cb.reset.parameters(my_par)))
   expect_false(is.null(attributes(bst3)$evaluation_log$train_error))
   expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error))
 
   # resetting multiple parameters at the same time runs with no error
   my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
   expect_error(
-    bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                      callbacks = list(cb.reset.parameters(my_par)))
+    bst4 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                      callbacks = list(xgb.cb.reset.parameters(my_par)))
   , NA) # NA = no error
   # CV works as well
   expect_error(
     bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0,
-                   callbacks = list(cb.reset.parameters(my_par)))
+                   callbacks = list(xgb.cb.reset.parameters(my_par)))
   , NA) # NA = no error
 
   # expect no learning with 0 learning rate
   my_par <- list(eta = c(0., 0.))
-  bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
-                    callbacks = list(cb.reset.parameters(my_par)))
+  bstX <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0,
+                    callbacks = list(xgb.cb.reset.parameters(my_par)))
   expect_false(is.null(attributes(bstX)$evaluation_log$train_error))
   er <- unique(attributes(bstX)$evaluation_log$train_error)
   expect_length(er, 1)
   expect_gt(er, 0.4)
 })
 
-test_that("cb.save.model works as expected", {
+test_that("xgb.cb.save.model works as expected", {
   files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json')
   files <- unname(sapply(files, function(f) file.path(tempdir(), f)))
   for (f in files) if (file.exists(f)) file.remove(f)
 
-  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
+  bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0,
                    save_period = 1, save_name = file.path(tempdir(), "xgboost_%02d.json"))
   expect_true(file.exists(files[1]))
   expect_true(file.exists(files[2]))
@@ -193,7 +239,7 @@ test_that("cb.save.model works as expected", {
   expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))
 
   # save_period = 0 saves the last iteration's model
-  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
+  bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0,
                    save_period = 0, save_name = file.path(tempdir(), 'xgboost.json'))
   expect_true(file.exists(files[3]))
   b2 <- xgb.load(files[3])
@@ -206,7 +252,7 @@ test_that("cb.save.model works as expected", {
 test_that("early stopping xgb.train works", {
   set.seed(11)
   expect_output(
-    bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
+    bst <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3,
                      early_stopping_rounds = 3, maximize = FALSE)
   , "Stopping. Best iteration")
   expect_false(is.null(xgb.attr(bst, "best_iteration")))
@@ -220,7 +266,7 @@ test_that("early stopping xgb.train works", {
 
   set.seed(11)
   expect_silent(
-    bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
+    bst0 <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3,
                       early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
   )
   expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log)
@@ -236,10 +282,10 @@ test_that("early stopping xgb.train works", {
 test_that("early stopping using a specific metric works", {
   set.seed(11)
   expect_output(
-    bst <- xgb.train(param[-2], dtrain, nrounds = 20, watchlist, eta = 0.6,
+    bst <- xgb.train(param[-2], dtrain, nrounds = 20, evals = evals, eta = 0.6,
                      eval_metric = "logloss", eval_metric = "auc",
-                     callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
-                                                    metric_name = 'test_logloss')))
+                     callbacks = list(xgb.cb.early.stop(stopping_rounds = 3, maximize = FALSE,
+                                                        metric_name = 'test_logloss')))
   , "Stopping. Best iteration")
   expect_false(is.null(xgb.attr(bst, "best_iteration")))
   expect_lt(xgb.attr(bst, "best_iteration"), 19)
@@ -269,7 +315,7 @@ test_that("early stopping works with titanic", {
     nrounds = 100,
     early_stopping_rounds = 3,
     nthread = n_threads,
-    watchlist = list(train = xgb.DMatrix(dtx, label = dty))
+    evals = list(train = xgb.DMatrix(dtx, label = dty))
   )
 
   expect_true(TRUE)  # should not crash
@@ -281,10 +327,10 @@ test_that("early stopping xgb.cv works", {
     cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.3, nrounds = 20,
                  early_stopping_rounds = 3, maximize = FALSE)
   , "Stopping. Best iteration")
-  expect_false(is.null(cv$best_iteration))
-  expect_lt(cv$best_iteration, 19)
+  expect_false(is.null(cv$early_stop$best_iteration))
+  expect_lt(cv$early_stop$best_iteration, 19)
   # the best error is min error:
-  expect_true(cv$evaluation_log[, test_error_mean[cv$best_iteration] == min(test_error_mean)])
+  expect_true(cv$evaluation_log[, test_error_mean[cv$early_stop$best_iteration] == min(test_error_mean)])
 })
 
 test_that("prediction in xgb.cv works", {
@@ -292,19 +338,19 @@ test_that("prediction in xgb.cv works", {
   nrounds <- 4
   cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0)
   expect_false(is.null(cv$evaluation_log))
-  expect_false(is.null(cv$pred))
-  expect_length(cv$pred, nrow(train$data))
-  err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))))
+  expect_false(is.null(cv$cv_predict$pred))
+  expect_length(cv$cv_predict$pred, nrow(train$data))
+  err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f]))))
   err_log <- cv$evaluation_log[nrounds, test_error_mean]
   expect_equal(err_pred, err_log, tolerance = 1e-6)
 
   # save CV models
   set.seed(11)
   cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0,
-                callbacks = list(cb.cv.predict(save_models = TRUE)))
+                callbacks = list(xgb.cb.cv.predict(save_models = TRUE)))
   expect_equal(cv$evaluation_log, cvx$evaluation_log)
-  expect_length(cvx$models, 5)
-  expect_true(all(sapply(cvx$models, class) == 'xgb.Booster'))
+  expect_length(cvx$cv_predict$models, 5)
+  expect_true(all(sapply(cvx$cv_predict$models, class) == 'xgb.Booster'))
 })
 
 test_that("prediction in xgb.cv works for gblinear too", {
@@ -312,8 +358,8 @@ test_that("prediction in xgb.cv works for gblinear too", {
   p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads)
   cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
   expect_false(is.null(cv$evaluation_log))
-  expect_false(is.null(cv$pred))
-  expect_length(cv$pred, nrow(train$data))
+  expect_false(is.null(cv$cv_predict$pred))
+  expect_length(cv$cv_predict$pred, nrow(train$data))
 })
 
 test_that("prediction in early-stopping xgb.cv works", {
@@ -324,14 +370,14 @@ test_that("prediction in early-stopping xgb.cv works", {
                  prediction = TRUE, base_score = 0.5)
   , "Stopping. Best iteration")
 
-  expect_false(is.null(cv$best_iteration))
-  expect_lt(cv$best_iteration, 19)
+  expect_false(is.null(cv$early_stop$best_iteration))
+  expect_lt(cv$early_stop$best_iteration, 19)
   expect_false(is.null(cv$evaluation_log))
-  expect_false(is.null(cv$pred))
-  expect_length(cv$pred, nrow(train$data))
+  expect_false(is.null(cv$cv_predict$pred))
+  expect_length(cv$cv_predict$pred, nrow(train$data))
 
-  err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))))
-  err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean]
+  err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f]))))
+  err_log <- cv$evaluation_log[cv$early_stop$best_iteration, test_error_mean]
   expect_equal(err_pred, err_log, tolerance = 1e-6)
   err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]
   expect_gt(abs(err_pred - err_log_last), 1e-4)
@@ -346,9 +392,9 @@ test_that("prediction in xgb.cv for softprob works", {
                  subsample = 0.8, gamma = 2, verbose = 0,
                  prediction = TRUE, objective = "multi:softprob", num_class = 3)
   , NA)
-  expect_false(is.null(cv$pred))
-  expect_equal(dim(cv$pred), c(nrow(iris), 3))
-  expect_lt(diff(range(rowSums(cv$pred))), 1e-6)
+  expect_false(is.null(cv$cv_predict$pred))
+  expect_equal(dim(cv$cv_predict$pred), c(nrow(iris), 3))
+  expect_lt(diff(range(rowSums(cv$cv_predict$pred))), 1e-6)
 })
 
 test_that("prediction in xgb.cv works for multi-quantile", {
@@ -368,7 +414,7 @@ test_that("prediction in xgb.cv works for multi-quantile", {
     prediction = TRUE,
     verbose = 0
   )
-  expect_equal(dim(cv$pred), c(nrow(x), 5))
+  expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 5))
 })
 
 test_that("prediction in xgb.cv works for multi-output", {
@@ -389,5 +435,5 @@ test_that("prediction in xgb.cv works for multi-output", {
     prediction = TRUE,
     verbose = 0
   )
-  expect_equal(dim(cv$pred), c(nrow(x), 2))
+  expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 2))
 })
diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R
index c65031246..d3050b152 100644
--- a/R-package/tests/testthat/test_custom_objective.R
+++ b/R-package/tests/testthat/test_custom_objective.R
@@ -12,7 +12,7 @@ dtrain <- xgb.DMatrix(
 dtest <- xgb.DMatrix(
   agaricus.test$data, label = agaricus.test$label, nthread = n_threads
 )
-watchlist <- list(eval = dtest, train = dtrain)
+evals <- list(eval = dtest, train = dtrain)
 
 logregobj <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
@@ -33,7 +33,7 @@ param <- list(max_depth = 2, eta = 1, nthread = n_threads,
 num_round <- 2
 
 test_that("custom objective works", {
-  bst <- xgb.train(param, dtrain, num_round, watchlist)
+  bst <- xgb.train(param, dtrain, num_round, evals)
   expect_equal(class(bst), "xgb.Booster")
   expect_false(is.null(attributes(bst)$evaluation_log))
   expect_false(is.null(attributes(bst)$evaluation_log$eval_error))
@@ -48,7 +48,7 @@ test_that("custom objective in CV works", {
 })
 
 test_that("custom objective with early stop works", {
-  bst <- xgb.train(param, dtrain, 10, watchlist)
+  bst <- xgb.train(param, dtrain, 10, evals)
   expect_equal(class(bst), "xgb.Booster")
   train_log <- attributes(bst)$evaluation_log$train_error
   expect_true(all(diff(train_log) <= 0))
@@ -66,7 +66,7 @@ test_that("custom objective using DMatrix attr works", {
     return(list(grad = grad, hess = hess))
   }
   param$objective <- logregobjattr
-  bst <- xgb.train(param, dtrain, num_round, watchlist)
+  bst <- xgb.train(param, dtrain, num_round, evals)
   expect_equal(class(bst), "xgb.Booster")
 })
 
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 50621f241..44d1566c6 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -41,13 +41,13 @@ test_that("xgb.DMatrix: basic construction", {
 
   params <- list(tree_method = "hist", nthread = n_threads)
   bst_fd <- xgb.train(
-    params, nrounds = 8, fd, watchlist = list(train = fd)
+    params, nrounds = 8, fd, evals = list(train = fd)
   )
   bst_dgr <- xgb.train(
-    params, nrounds = 8, fdgr, watchlist = list(train = fdgr)
+    params, nrounds = 8, fdgr, evals = list(train = fdgr)
   )
   bst_dgc <- xgb.train(
-    params, nrounds = 8, fdgc, watchlist = list(train = fdgc)
+    params, nrounds = 8, fdgc, evals = list(train = fdgc)
   )
 
   raw_fd <- xgb.save.raw(bst_fd, raw_format = "ubj")
@@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", {
   file.remove(fname_nan)
 })
 
+test_that("xgb.DMatrix: missing in CSR", {
+  x_dense <- matrix(as.numeric(1:10), nrow = 5)
+  x_dense[2, 1] <- NA_real_
+
+  x_csr <- as(x_dense, "RsparseMatrix")
+
+  # Write under tempdir() rather than the working directory so the test
+  # never leaves stray files next to the test sources.
+  fname_dense <- file.path(tempdir(), "dense.dmatrix")
+  fname_csr <- file.path(tempdir(), "csr.dmatrix")
+
+  m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_)
+  xgb.DMatrix.save(m_dense, fname_dense)
+
+  m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA)
+  xgb.DMatrix.save(m_csr, fname_csr)
+
+  expect_equal(file.size(fname_dense), file.size(fname_csr))
+
+  # readBin() on a path opens and closes its own connection, so no
+  # connection is left open while the expectations below run.
+  bytes <- file.size(fname_dense)
+  densedmatrix <- readBin(fname_dense, "raw", n = bytes)
+  csrmatrix <- readBin(fname_csr, "raw", n = bytes)
+
+  expect_equal(length(densedmatrix), length(csrmatrix))
+  expect_equal(densedmatrix, csrmatrix)
+
+  file.remove(fname_dense, fname_csr)
+})
+
 test_that("xgb.DMatrix: error on three-dimensional array", {
   set.seed(123)
   x <- matrix(rnorm(500), nrow = 50)
@@ -692,3 +723,20 @@ test_that("xgb.DMatrix: quantile cuts look correct", {
     }
   )
 })
+
+test_that("xgb.DMatrix: can read CSV", {
+  # Two CSV rows; `label_column=0` in the URI makes the first column the
+  # label, leaving the remaining two columns as features.
+  txt <- paste("1,2,3", "-1,3,2", sep = "\n")
+  fname <- file.path(tempdir(), "data.csv")
+  writeChar(txt, fname)
+  uri <- paste0(fname, "?format=csv&label_column=0")
+  dm <- xgb.DMatrix(uri, silent = TRUE)
+  expect_equal(getinfo(dm, "label"), c(1, -1))
+  expect_equal(
+    as.matrix(xgb.get.DMatrix.data(dm)),
+    matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE)
+  )
+  # Remove the fixture so repeated runs do not accumulate temp files.
+  file.remove(fname)
+})
diff --git a/R-package/tests/testthat/test_feature_weights.R b/R-package/tests/testthat/test_feature_weights.R
index 4ed78c9b6..54fec67cf 100644
--- a/R-package/tests/testthat/test_feature_weights.R
+++ b/R-package/tests/testthat/test_feature_weights.R
@@ -25,7 +25,7 @@ test_that("training with feature weights works", {
     expect_lt(importance[1, Frequency], importance[9, Frequency])
   }
 
-  for (tm in c("hist", "approx", "exact")) {
+  for (tm in c("hist", "approx")) {
     test(tm)
   }
 })
diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R
index 349bcce8d..b59de8b62 100644
--- a/R-package/tests/testthat/test_glm.R
+++ b/R-package/tests/testthat/test_glm.R
@@ -14,37 +14,37 @@ test_that("gblinear works", {
 
   param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
                 nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
-  watchlist <- list(eval = dtest, train = dtrain)
+  evals <- list(eval = dtest, train = dtrain)
 
   n <- 5         # iterations
   ERR_UL <- 0.005 # upper limit for the test set error
   VERB <- 0      # chatterbox switch
 
   param$updater <- 'shotgun'
-  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
+  bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle')
   ypred <- predict(bst, dtest)
   expect_equal(length(getinfo(dtest, 'label')), 1611)
   expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
 
-  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic',
-                   callbacks = list(cb.gblinear.history()))
+  bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic',
+                   callbacks = list(xgb.cb.gblinear.history()))
   expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
   h <- xgb.gblinear.history(bst)
   expect_equal(dim(h), c(n, ncol(dtrain) + 1))
   expect_is(h, "matrix")
 
   param$updater <- 'coord_descent'
-  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
+  bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic')
   expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
 
-  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
+  bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle')
   expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
 
-  bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy')
+  bst <- xgb.train(param, dtrain, 2, evals, verbose = VERB, feature_selector = 'greedy')
   expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL)
 
-  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty',
-                   top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE)))
+  bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'thrifty',
+                   top_k = 50, callbacks = list(xgb.cb.gblinear.history(sparse = TRUE)))
   expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
   h <- xgb.gblinear.history(bst)
   expect_equal(dim(h), c(n, ncol(dtrain) + 1))
diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R
index e49a32025..0e7db42da 100644
--- a/R-package/tests/testthat/test_ranking.R
+++ b/R-package/tests/testthat/test_ranking.R
@@ -15,7 +15,7 @@ test_that('Test ranking with unweighted data', {
 
   params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
                  eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
-  bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
+  bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
   expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
@@ -39,7 +39,7 @@ test_that('Test ranking with weighted data', {
     eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1,
     eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads
   )
-  bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
+  bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
   expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R
index 3c88178e0..7fdc6eb84 100644
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -17,7 +17,7 @@ dtest <- xgb.DMatrix(
 win32_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8
 
 test_that("updating the model works", {
-  watchlist <- list(train = dtrain, test = dtest)
+  evals <- list(train = dtrain, test = dtest)
 
   # no-subsampling
   p1 <- list(
@@ -25,19 +25,19 @@ test_that("updating the model works", {
     updater = "grow_colmaker,prune"
   )
   set.seed(11)
-  bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
+  bst1 <- xgb.train(p1, dtrain, nrounds = 10, evals = evals, verbose = 0)
   tr1 <- xgb.model.dt.tree(model = bst1)
 
   # with subsampling
   p2 <- modifyList(p1, list(subsample = 0.1))
   set.seed(11)
-  bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0)
+  bst2 <- xgb.train(p2, dtrain, nrounds = 10, evals = evals, verbose = 0)
   tr2 <- xgb.model.dt.tree(model = bst2)
 
   # the same no-subsampling boosting with an extra 'refresh' updater:
   p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
   set.seed(11)
-  bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0)
+  bst1r <- xgb.train(p1r, dtrain, nrounds = 10, evals = evals, verbose = 0)
   tr1r <- xgb.model.dt.tree(model = bst1r)
   # all should be the same when no subsampling
   expect_equal(attributes(bst1)$evaluation_log, attributes(bst1r)$evaluation_log)
@@ -53,7 +53,7 @@ test_that("updating the model works", {
   # the same boosting with subsampling with an extra 'refresh' updater:
   p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
   set.seed(11)
-  bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0)
+  bst2r <- xgb.train(p2r, dtrain, nrounds = 10, evals = evals, verbose = 0)
   tr2r <- xgb.model.dt.tree(model = bst2r)
   # should be the same evaluation but different gains and larger cover
   expect_equal(attributes(bst2)$evaluation_log, attributes(bst2r)$evaluation_log)
@@ -66,7 +66,7 @@ test_that("updating the model works", {
   # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
   set.seed(123)
   p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
-  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
+  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst1)
   tr1u <- xgb.model.dt.tree(model = bst1u)
   # all should be the same when no subsampling
   expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
@@ -79,7 +79,7 @@ test_that("updating the model works", {
 
   # same thing but with a serialized model
   set.seed(123)
-  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
+  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1))
   tr1u <- xgb.model.dt.tree(model = bst1u)
   # all should be the same when no subsampling
   expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
@@ -87,7 +87,7 @@ test_that("updating the model works", {
 
   # process type 'update' for model with subsampling, refreshing only the tree stats from training data:
   p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
-  bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
+  bst2u <- xgb.train(p2u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst2)
   tr2u <- xgb.model.dt.tree(model = bst2u)
   # should be the same evaluation but different gains and larger cover
   expect_equal(attributes(bst2)$evaluation_log, attributes(bst2u)$evaluation_log)
@@ -102,7 +102,7 @@ test_that("updating the model works", {
 
   # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
   p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
-  bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
+  bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst1)
   tr1ut <- xgb.model.dt.tree(model = bst1ut)
   # should be the same evaluations but different gains and smaller cover (test data is smaller)
   expect_equal(attributes(bst1)$evaluation_log, attributes(bst1ut)$evaluation_log)
@@ -115,18 +115,18 @@ test_that("updating works for multiclass & multitree", {
   dtr <- xgb.DMatrix(
     as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
   )
-  watchlist <- list(train = dtr)
+  evals <- list(train = dtr)
   p0 <- list(max_depth = 2, eta = 0.5, nthread = n_threads, subsample = 0.6,
              objective = "multi:softprob", num_class = 3, num_parallel_tree = 2,
              base_score = 0)
   set.seed(121)
-  bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0)
+  bst0 <- xgb.train(p0, dtr, 5, evals = evals, verbose = 0)
   tr0 <- xgb.model.dt.tree(model = bst0)
 
   # run update process for an original model with subsampling
   p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
   bst0u <- xgb.train(p0u, dtr, nrounds = xgb.get.num.boosted.rounds(bst0),
-                     watchlist, xgb_model = bst0, verbose = 0)
+                     evals = evals, xgb_model = bst0, verbose = 0)
   tr0u <- xgb.model.dt.tree(model = bst0u)
 
   # should be the same evaluation but different gains and larger cover
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index efafc624d..fc49adc0f 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -341,10 +341,10 @@ One way to measure progress in learning of a model is to provide to **XGBoost**
 
 > in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors.
 
-For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
+For the purpose of this example, we use the `evals` parameter. It is a list of `xgb.DMatrix` objects, each of them tagged with a name.
 
-```{r watchlist, message=F, warning=F}
-watchlist <- list(train = dtrain, test = dtest)
+```{r evals, message=F, warning=F}
+evals <- list(train = dtrain, test = dtest)
 
 bst <- xgb.train(
     data = dtrain
@@ -355,7 +355,7 @@ bst <- xgb.train(
         , objective = "binary:logistic"
     )
     , nrounds = 2
-    , watchlist = watchlist
+    , evals = evals
 )
 ```
 
@@ -367,7 +367,7 @@ If with your own dataset you have not such results, you should think about how y
 
 For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
 
-```{r watchlist2, message=F, warning=F}
+```{r evals2, message=F, warning=F}
 bst <- xgb.train(
     data = dtrain
     , max_depth = 2
@@ -379,7 +379,7 @@ bst <- xgb.train(
         , eval_metric = "logloss"
     )
     , nrounds = 2
-    , watchlist = watchlist
+    , evals = evals
 )
 ```
 
@@ -401,7 +401,7 @@ bst <- xgb.train(
         , eval_metric = "logloss"
     )
     , nrounds = 2
-    , watchlist = watchlist
+    , evals = evals
 )
 ```
 
@@ -430,7 +430,7 @@ bst <- xgb.train(
         , objective = "binary:logistic"
     )
     , nrounds = 2
-    , watchlist = watchlist
+    , evals = evals
 )
 ```
 
@@ -496,6 +496,9 @@ An interesting test to see how identical our saved model is to the original one
 
 ```{r loadModel, message=F, warning=F}
 # load binary model to R
+# Note that the number of threads for 'xgb.load' is taken from the global config,
+# and it can be modified like this:
+RhpcBLASctl::omp_set_num_threads(1)
 bst2 <- xgb.load(fname)
 xgb.parameters(bst2) <- list(nthread = 2)
 pred2 <- predict(bst2, test$data)
diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst
index 8a27d0174..bf9c1f8d9 100644
--- a/doc/R-package/index.rst
+++ b/doc/R-package/index.rst
@@ -34,4 +34,5 @@ Other topics
 .. toctree::
   :maxdepth: 2
   :titlesonly:
+
   Handling of indexable elements <index_base>
diff --git a/doc/parameter.rst b/doc/parameter.rst
index a7d8203b0..00f0eaea6 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -118,7 +118,7 @@ Parameters for Tree Booster
   - All ``colsample_by*`` parameters have a range of (0, 1], the default value of 1, and specify the fraction of columns to be subsampled.
   - ``colsample_bytree`` is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
   - ``colsample_bylevel`` is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree.
-  - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level.
+  - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method.
   - ``colsample_by*`` parameters work cumulatively. For instance,
     the combination ``{'colsample_bytree':0.5, 'colsample_bylevel':0.5,
     'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at
@@ -450,7 +450,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
 
 * ``seed`` [default=0]
 
-  - Random number seed.  This parameter is ignored in R package, use `set.seed()` instead.
+  - Random number seed.  In the R package, if not specified, a random seed is drawn from R's own RNG engine instead of defaulting to zero.
 
 * ``seed_per_iteration`` [default= ``false``]
 
@@ -489,7 +489,7 @@ Parameters for learning to rank (``rank:ndcg``, ``rank:map``, ``rank:pairwise``)
 
 These are parameters specific to learning to rank task. See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an in-depth explanation.
 
-* ``lambdarank_pair_method`` [default = ``mean``]
+* ``lambdarank_pair_method`` [default = ``topk``]
 
   How to construct pairs for pair-wise learning.
 
@@ -500,7 +500,13 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra
 
   It specifies the number of pairs sampled for each document when pair method is ``mean``, or the truncation level for queries when the pair method is ``topk``. For example, to train with ``ndcg@6``, set ``lambdarank_num_pair_per_sample`` to :math:`6` and ``lambdarank_pair_method`` to ``topk``.
 
-* ``lambdarank_unbiased`` [default = ``false``]
+* ``lambdarank_normalization`` [default = ``true``]
+
+  .. versionadded:: 2.1.0
+
+  Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress.
+
+* ``lambdarank_unbiased`` [default = ``false``]
 
   Specify whether do we need to debias input click data.
 
diff --git a/doc/python/callbacks.rst b/doc/python/callbacks.rst
index 7cb257a81..6d8b43a11 100644
--- a/doc/python/callbacks.rst
+++ b/doc/python/callbacks.rst
@@ -36,7 +36,7 @@ inside iteration loop.  You can also pass this callback function directly into X
     # Specify which dataset and which metric should be used for early stopping.
     early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                             metric_name='CustomErr',
-                                            data_name='Train')
+                                            data_name='Valid')
 
     booster = xgb.train(
         {'objective': 'binary:logistic',
diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst
index 0d26a5253..cfdd20da0 100644
--- a/doc/python/python_intro.rst
+++ b/doc/python/python_intro.rst
@@ -63,7 +63,7 @@ The input data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object. For
 
   .. code-block:: python
 
-    dtrain = xgb.DMatrix('train.svm.txt')
+    dtrain = xgb.DMatrix('train.svm.txt?format=libsvm')
     dtrain.save_binary('train.buffer')
 
 * Missing values can be replaced by a default value in the :py:class:`DMatrix <xgboost.DMatrix>` constructor:
@@ -86,7 +86,7 @@ to number of groups.
 
   .. code-block:: python
 
-    dtrain = xgb.DMatrix('train.svm.txt')
+    dtrain = xgb.DMatrix('train.svm.txt?format=libsvm')
     dtest = xgb.DMatrix('test.svm.buffer')
 
   The parser in XGBoost has limited functionality. When using Python interface, it's
@@ -176,7 +176,6 @@ Support Matrix
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | pyarrow.Table           | NPA       | NPA               | NPA       | NPA       | NPA                | NPA         |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
-+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | _\_array\_\_            | NPA       | F                 | NPA       | NPA       | H                  |             |
 +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
 | Others                  | SciCSR    | F                 |           | F         | F                  |             |
@@ -240,7 +239,7 @@ A saved model can be loaded as follows:
 .. code-block:: python
 
   bst = xgb.Booster({'nthread': 4})  # init model
-  bst.load_model('model.bin')  # load data
+  bst.load_model('model.bin')  # load model data
 
 Methods including `update` and `boost` from `xgboost.Booster` are designed for
 internal usage only.  The wrapper function `xgboost.train` does some
diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst
index 207b9fa30..1aaa340b1 100644
--- a/doc/python/sklearn_estimator.rst
+++ b/doc/python/sklearn_estimator.rst
@@ -62,7 +62,7 @@ stack of trees:
 .. code-block:: python
 
     early_stop = xgb.callback.EarlyStopping(
-        rounds=2, metric_name='logloss', data_name='Validation_0', save_best=True
+        rounds=2, metric_name='logloss', data_name='validation_0', save_best=True
     )
     clf = xgb.XGBClassifier(tree_method="hist", callbacks=[early_stop])
     clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
diff --git a/doc/requirements.txt b/doc/requirements.txt
index 667ef268f..ddff9be92 100644
--- a/doc/requirements.txt
+++ b/doc/requirements.txt
@@ -7,7 +7,9 @@ sh
 matplotlib
 graphviz
 numpy
+scipy
 myst-parser
+ray[train]
 xgboost_ray
 sphinx-gallery
 pyspark
diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
index 015f736e0..bfc727ed7 100644
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -48,7 +48,7 @@ Notice that the samples are sorted based on their query index in a non-decreasin
   import xgboost as xgb
 
   # Make a synthetic ranking dataset for demonstration
-  seed = 1994 
+  seed = 1994
   X, y = make_classification(random_state=seed)
   rng = np.random.default_rng(seed)
   n_query_groups = 3
@@ -146,7 +146,8 @@ The consideration of effective pairs also applies to the choice of pair method (
 
 When using the mean strategy for generating pairs, where the target metric (like ``NDCG``) is computed over the whole query list, users can specify how many pairs should be generated per each document, by setting the ``lambdarank_num_pair_per_sample``. XGBoost will randomly sample ``lambdarank_num_pair_per_sample`` pairs for each element in the query group (:math:`|pairs| = |query| \times num\_pairsample`). Often, setting it to 1 can produce reasonable results. In cases where performance is inadequate due to insufficient number of effective pairs being generated, set ``lambdarank_num_pair_per_sample`` to a higher value. As more document pairs are generated, more effective pairs will be generated as well.
 
-On the other hand, if you are prioritizing the top :math:`k` documents, the ``lambdarank_num_pair_per_sample`` should be set slightly higher than :math:`k` (with a few more documents) to obtain a good training result.
+On the other hand, if you are prioritizing the top :math:`k` documents, the ``lambdarank_num_pair_per_sample`` should be set slightly higher than :math:`k` (with a few more documents) to obtain a good training result. Lastly, XGBoost employs additional regularization for learning to rank objectives, which can be disabled by setting the ``lambdarank_normalization`` to ``False``.
+
 
 **Summary** If you have large amount of training data:
 
diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst
index 8bd1dcd97..4e608440a 100644
--- a/doc/tutorials/spark_estimator.rst
+++ b/doc/tutorials/spark_estimator.rst
@@ -28,7 +28,7 @@ We can create a ``SparkXGBRegressor`` estimator like:
 .. code-block:: python
 
   from xgboost.spark import SparkXGBRegressor
-  spark_reg_estimator = SparkXGBRegressor(
+  xgb_regressor = SparkXGBRegressor(
     features_col="features",
     label_col="label",
     num_workers=2,
@@ -61,7 +61,7 @@ type or spark array type.
 
 .. code-block:: python
 
-  transformed_test_spark_dataframe = xgb_regressor.predict(test_spark_dataframe)
+  transformed_test_spark_dataframe = xgb_regressor_model.transform(test_spark_dataframe)
 
 
 The above snippet code returns a ``transformed_test_spark_dataframe`` that contains the input
diff --git a/doc/xgboost_doc.yml b/doc/xgboost_doc.yml
index 90b877e73..177e8758f 100644
--- a/doc/xgboost_doc.yml
+++ b/doc/xgboost_doc.yml
@@ -1,15 +1,23 @@
 name: xgboost_docs
 dependencies:
-  - python
+  - python=3.10
   - pip
   - pygraphviz
   - sphinx
+  - sphinx-gallery
   - recommonmark
   - mock
   - sh
   - matplotlib
+  - numpy
+  - scipy
+  - scikit-learn
+  - myst-parser
+  - pyspark
   - pip:
     - breathe
     - sphinx_rtd_theme
     - pydot-ng
     - graphviz
+    - ray[train]
+    - xgboost_ray
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 1c4b6568e..4009df59f 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -37,7 +37,7 @@
  * \brief Whether to customize global PRNG.
  */
 #ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG
-#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG XGBOOST_STRICT_R_MODE
+#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG 0
 #endif  // XGBOOST_CUSTOMIZE_GLOBAL_PRNG
 
 /*!
diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h
index 507171dd4..919d3a902 100644
--- a/include/xgboost/collective/result.h
+++ b/include/xgboost/collective/result.h
@@ -1,8 +1,10 @@
 /**
- *  Copyright 2023, XGBoost Contributors
+ *  Copyright 2023-2024, XGBoost Contributors
  */
 #pragma once
 
+#include <xgboost/logging.h>
+
 #include <memory>   // for unique_ptr
 #include <sstream>  // for stringstream
 #include <stack>    // for stack
@@ -160,10 +162,16 @@ struct Result {
 
 // We don't have monad, a simple helper would do.
 template <typename Fn>
-Result operator<<(Result&& r, Fn&& fn) {
+[[nodiscard]] Result operator<<(Result&& r, Fn&& fn) {
   if (!r.OK()) {
     return std::forward<Result>(r);
   }
   return fn();
 }
+
+inline void SafeColl(Result const& rc) {
+  if (!rc.OK()) {
+    LOG(FATAL) << rc.Report();
+  }
+}
 }  // namespace xgboost::collective
diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index a5872ec3a..77ca6a510 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2023 by XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #ifndef XGBOOST_JSON_H_
 #define XGBOOST_JSON_H_
@@ -42,7 +42,8 @@ class Value {
     kBoolean,
     kNull,
     // typed array for ubjson
-    kNumberArray,
+    kF32Array,
+    kF64Array,
     kU8Array,
     kI32Array,
     kI64Array
@@ -173,7 +174,11 @@ class JsonTypedArray : public Value {
 /**
  * @brief Typed UBJSON array for 32-bit floating point.
  */
-using F32Array = JsonTypedArray<float, Value::ValueKind::kNumberArray>;
+using F32Array = JsonTypedArray<float, Value::ValueKind::kF32Array>;
+/**
+ * @brief Typed UBJSON array for 64-bit floating point.
+ */
+using F64Array = JsonTypedArray<double, Value::ValueKind::kF64Array>;
 /**
  * @brief Typed UBJSON array for uint8_t.
  */
@@ -457,9 +462,9 @@ class Json {
   Json& operator[](int ind)                 const { return (*ptr_)[ind]; }
 
   /*! \brief Return the reference to stored Json value. */
-  Value const& GetValue() const & { return *ptr_; }
-  Value const& GetValue() &&      { return *ptr_; }
-  Value&       GetValue() &       { return *ptr_; }
+  [[nodiscard]] Value const& GetValue() const& { return *ptr_; }
+  Value const& GetValue() && { return *ptr_; }
+  Value& GetValue() & { return *ptr_; }
 
   bool operator==(Json const& rhs) const {
     return *ptr_ == *(rhs.ptr_);
@@ -472,7 +477,7 @@ class Json {
     return os;
   }
 
-  IntrusivePtr<Value> const& Ptr() const { return ptr_; }
+  [[nodiscard]] IntrusivePtr<Value> const& Ptr() const { return ptr_; }
 
  private:
   IntrusivePtr<Value> ptr_{new JsonNull};
diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h
index 3a73d170a..ce3d25c37 100644
--- a/include/xgboost/json_io.h
+++ b/include/xgboost/json_io.h
@@ -142,6 +142,7 @@ class JsonWriter {
 
   virtual void Visit(JsonArray  const* arr);
   virtual void Visit(F32Array  const* arr);
+  virtual void Visit(F64Array const*) { LOG(FATAL) << "Only UBJSON format can handle f64 array."; }
   virtual void Visit(U8Array  const* arr);
   virtual void Visit(I32Array  const* arr);
   virtual void Visit(I64Array  const* arr);
@@ -244,7 +245,8 @@ class UBJReader : public JsonReader {
  */
 class UBJWriter : public JsonWriter {
   void Visit(JsonArray const* arr) override;
-  void Visit(F32Array  const* arr) override;
+  void Visit(F32Array const* arr) override;
+  void Visit(F64Array const* arr) override;
   void Visit(U8Array  const* arr) override;
   void Visit(I32Array  const* arr) override;
   void Visit(I64Array  const* arr) override;
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 26a072e52..8f9cd3eaa 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -295,6 +295,9 @@ class TensorView {
   using ShapeT = std::size_t[kDim];
   using StrideT = ShapeT;
 
+  using element_type = T;                  // NOLINT
+  using value_type = std::remove_cv_t<T>;  // NOLINT
+
  private:
   StrideT stride_{1};
   ShapeT shape_{0};
@@ -314,7 +317,7 @@ class TensorView {
   }
 
   template <size_t old_dim, size_t new_dim, int32_t D, typename I>
-  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
+  LINALG_HD size_t MakeSliceDim(std::size_t new_shape[D], std::size_t new_stride[D],
                                 detail::RangeTag<I> &&range) const {
     static_assert(new_dim < D);
     static_assert(old_dim < kDim);
@@ -528,9 +531,10 @@ class TensorView {
   LINALG_HD auto Stride(size_t i) const { return stride_[i]; }
 
   /**
-   * \brief Number of items in the tensor.
+   * @brief Number of items in the tensor.
    */
   [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
   /**
    * \brief Whether this is a contiguous array, both C and F contiguous returns true.
    */
@@ -865,7 +869,9 @@ class Tensor {
   auto HostView() { return this->View(DeviceOrd::CPU()); }
   auto HostView() const { return this->View(DeviceOrd::CPU()); }
 
-  [[nodiscard]] size_t Size() const { return data_.Size(); }
+  [[nodiscard]] std::size_t Size() const { return data_.Size(); }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
+
   auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
   auto Shape(size_t i) const { return shape_[i]; }
 
diff --git a/include/xgboost/span.h b/include/xgboost/span.h
index b0c1a5c1e..3e1325ceb 100644
--- a/include/xgboost/span.h
+++ b/include/xgboost/span.h
@@ -738,10 +738,10 @@ class IterSpan {
     return {data() + _offset, _count == dynamic_extent ? size() - _offset : _count};
   }
   [[nodiscard]] XGBOOST_DEVICE constexpr iterator begin() const noexcept {  // NOLINT
-    return {this, 0};
+    return it_;
   }
   [[nodiscard]] XGBOOST_DEVICE constexpr iterator end() const noexcept {  // NOLINT
-    return {this, size()};
+    return it_ + size();
   }
 };
 }  // namespace common
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index 4c475da2e..32b93c5ca 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  * \file tree_model.h
  * \brief model structure for tree
  * \author Tianqi Chen
@@ -688,6 +688,9 @@ class RegTree : public Model {
     }
     return (*this)[nidx].DefaultLeft();
   }
+  [[nodiscard]] bst_node_t DefaultChild(bst_node_t nidx) const {
+    return this->DefaultLeft(nidx) ? this->LeftChild(nidx) : this->RightChild(nidx);
+  }
   [[nodiscard]] bool IsRoot(bst_node_t nidx) const {
     if (IsMultiTarget()) {
       return nidx == kRoot;
diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py
index 395bc79b0..ff7bba693 100755
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@@ -83,44 +83,59 @@ def native_build(args):
     with cd(".."):
         build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'build'
         maybe_makedirs(build_dir)
+
+        if sys.platform == "linux":
+            maybe_parallel_build = " -- -j $(nproc)"
+        else:
+            maybe_parallel_build = ""
+
+        if cli_args.log_capi_invocation == "ON":
+            CONFIG["LOG_CAPI_INVOCATION"] = "ON"
+
+        if cli_args.use_cuda == "ON":
+            CONFIG["USE_CUDA"] = "ON"
+            CONFIG["USE_NCCL"] = "ON"
+            CONFIG["USE_DLOPEN_NCCL"] = "OFF"
+        elif cli_args.use_hip== 'ON':
+            CONFIG['USE_HIP'] = 'ON'
+            CONFIG['USE_RCCL'] = 'ON'
+            CONFIG["USE_DLOPEN_RCCL"] = "OFF"
+
+        args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
+
+        # if enviorment set rabit_mock
+        if os.getenv("RABIT_MOCK", None) is not None:
+            args.append("-DRABIT_MOCK:BOOL=ON")
+
+        # if enviorment set GPU_ARCH_FLAG
+        gpu_arch_flag = os.getenv("GPU_ARCH_FLAG", None)
+        if gpu_arch_flag is not None:
+            args.append("%s" % gpu_arch_flag)
+
         with cd(build_dir):
-            if sys.platform == "win32":
-                # Force x64 build on Windows.
-                maybe_generator = " -A x64"
-            else:
-                maybe_generator = ""
-            if sys.platform == "linux":
-                maybe_parallel_build = " -- -j $(nproc)"
-            else:
-                maybe_parallel_build = ""
-
-            if cli_args.log_capi_invocation == "ON":
-                CONFIG["LOG_CAPI_INVOCATION"] = "ON"
-
-            if cli_args.use_cuda == "ON":
-                CONFIG["USE_CUDA"] = "ON"
-                CONFIG["USE_NCCL"] = "ON"
-                CONFIG["USE_DLOPEN_NCCL"] = "OFF"
-            elif cli_args.use_hip== 'ON':
-                CONFIG['USE_HIP'] = 'ON'
-                CONFIG['USE_RCCL'] = 'ON'
-                CONFIG["USE_DLOPEN_RCCL"] = "OFF"
-
-            args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]
-
-            # if enviorment set rabit_mock
-            if os.getenv("RABIT_MOCK", None) is not None:
-                args.append("-DRABIT_MOCK:BOOL=ON")
-
-            # if enviorment set GPU_ARCH_FLAG
-            gpu_arch_flag = os.getenv("GPU_ARCH_FLAG", None)
-            if gpu_arch_flag is not None:
-                args.append("%s" % gpu_arch_flag)
-
             lib_dir = os.path.join(os.pardir, "lib")
             if os.path.exists(lib_dir):
                 shutil.rmtree(lib_dir)
-            run("cmake .. " + " ".join(args) + maybe_generator)
+
+            # Same trick as Python build, just test all possible generators.
+            if sys.platform == "win32":
+                supported_generators = (
+                    "",  # empty, decided by cmake
+                    '-G"Visual Studio 17 2022" -A x64',
+                    '-G"Visual Studio 16 2019" -A x64',
+                    '-G"Visual Studio 15 2017" -A x64',
+                )
+                for generator in supported_generators:
+                    try:
+                        run("cmake .. " + " ".join(args + [generator]))
+                        break
+                    except subprocess.CalledProcessError as e:
+                        print(f"Failed to build with generator: {generator}", e)
+                        with cd(os.path.pardir):
+                            shutil.rmtree(build_dir)
+                            maybe_makedirs(build_dir)
+            else:
+                run("cmake .. " + " ".join(args))
             run("cmake --build . --config Release" + maybe_parallel_build)
 
         with cd("demo/CLI/regression"):
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index e575f1a41..5d20e120e 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -1,10 +1,7 @@
 if(PLUGIN_SYCL)
   set(CMAKE_CXX_COMPILER "icpx")
-  add_library(plugin_sycl OBJECT
-    ${xgboost_SOURCE_DIR}/plugin/sycl/objective/regression_obj.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/objective/multiclass_obj.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc
-    ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc)
+  file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc")
+  add_library(plugin_sycl OBJECT ${SYCL_SOURCES})
   target_include_directories(plugin_sycl
     PRIVATE
     ${xgboost_SOURCE_DIR}/include
diff --git a/plugin/sycl/common/partition_builder.h b/plugin/sycl/common/partition_builder.h
index 37d1af241..c520ff31f 100644
--- a/plugin/sycl/common/partition_builder.h
+++ b/plugin/sycl/common/partition_builder.h
@@ -21,6 +21,9 @@
 #pragma GCC diagnostic pop
 
 #include "../data.h"
+#include "row_set.h"
+#include "../data/gradient_index.h"
+#include "../tree/expand_entry.h"
 
 #include <CL/sycl.hpp>
 
@@ -28,6 +31,87 @@ namespace xgboost {
 namespace sycl {
 namespace common {
 
+// Split row indices (rid_span) into 2 parts (both stored in rid_buf) depending
+// on comparison of the bin index values (read from gmat) with the split point (split_cond).
+// Handles dense columns.
+template <bool default_left, typename BinIdxType>
+inline ::sycl::event PartitionDenseKernel(
+                                 ::sycl::queue* qu,
+                                 const GHistIndexMatrix& gmat,
+                                 const RowSetCollection::Elem& rid_span,
+                                 const size_t fid,
+                                 const int32_t split_cond,
+                                 xgboost::common::Span<size_t>* rid_buf,
+                                 size_t* parts_size,
+                                 ::sycl::event event) {
+  const size_t row_stride = gmat.row_stride;
+  const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
+  const size_t* rid = rid_span.begin;
+  const size_t range_size = rid_span.Size();
+  const size_t offset = gmat.cut.Ptrs()[fid];
+
+  size_t* p_rid_buf = rid_buf->data();
+
+  return qu->submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event);
+    cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
+      const size_t id = rid[nid.get_id(0)];
+      const int32_t value = static_cast<int32_t>(gradient_index[id * row_stride + fid] + offset);
+      const bool is_left = value <= split_cond;
+      if (is_left) {
+        AtomicRef<size_t> n_left(parts_size[0]);
+        p_rid_buf[n_left.fetch_add(1)] = id;
+      } else {
+        AtomicRef<size_t> n_right(parts_size[1]);
+        p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
+      }
+    });
+  });
+}
+
+// Split row indices (rid_span) into 2 parts (both stored in rid_buf) depending
+// on comparison of the bin index values (read from gmat) with the split point (split_cond).
+// Handles sparse columns.
+template <bool default_left, typename BinIdxType>
+inline ::sycl::event PartitionSparseKernel(::sycl::queue* qu,
+                                  const GHistIndexMatrix& gmat,
+                                  const RowSetCollection::Elem& rid_span,
+                                  const size_t fid,
+                                  const int32_t split_cond,
+                                  xgboost::common::Span<size_t>* rid_buf,
+                                  size_t* parts_size,
+                                  ::sycl::event event) {
+  const size_t row_stride = gmat.row_stride;
+  const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
+  const size_t* rid = rid_span.begin;
+  const size_t range_size = rid_span.Size();
+  const uint32_t* cut_ptrs = gmat.cut_device.Ptrs().DataConst();
+
+  size_t* p_rid_buf = rid_buf->data();
+  return qu->submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event);
+    cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
+      const size_t id = rid[nid.get_id(0)];
+
+      const BinIdxType* gr_index_local = gradient_index + row_stride * id;
+      const int32_t fid_local = std::lower_bound(gr_index_local,
+                                                 gr_index_local + row_stride,
+                                                 cut_ptrs[fid]) - gr_index_local;
+      const bool is_left = (fid_local >= row_stride ||
+                            gr_index_local[fid_local] >= cut_ptrs[fid + 1]) ?
+                              default_left :
+                              gr_index_local[fid_local] <= split_cond;
+      if (is_left) {
+        AtomicRef<size_t> n_left(parts_size[0]);
+        p_rid_buf[n_left.fetch_add(1)] = id;
+      } else {
+        AtomicRef<size_t> n_right(parts_size[1]);
+        p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
+      }
+    });
+  });
+}
+
 // The builder is required for samples partition to left and rights children for set of nodes
 class PartitionBuilder {
  public:
@@ -53,7 +137,6 @@ class PartitionBuilder {
     return result_rows_[2 * nid];
   }
 
-
   size_t GetNRightElems(int nid) const {
     return result_rows_[2 * nid + 1];
   }
@@ -72,19 +155,97 @@ class PartitionBuilder {
     return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
   }
 
+  // Runs the partition kernel for one node, choosing the dense or sparse variant from the
+  // matrix layout and instantiating it for the node's default direction.
+  template <typename BinIdxType>
+  ::sycl::event Partition(const int32_t split_cond,
+                        const GHistIndexMatrix& gmat,
+                        const RowSetCollection::Elem& rid_span,
+                        const xgboost::RegTree::Node& node,
+                        xgboost::common::Span<size_t>* rid_buf,
+                        size_t* parts_size,
+                        ::sycl::event event) {
+    const bst_uint split_feature = node.SplitIndex();
+    const bool go_left_by_default = node.DefaultLeft();
+    const bool dense_layout = gmat.IsDense();
+    // The default direction is a template argument, so each case needs its own call.
+    if (dense_layout && go_left_by_default) {
+      return PartitionDenseKernel<true, BinIdxType>(
+          qu_, gmat, rid_span, split_feature, split_cond, rid_buf, parts_size, event);
+    }
+    if (dense_layout) {
+      return PartitionDenseKernel<false, BinIdxType>(
+          qu_, gmat, rid_span, split_feature, split_cond, rid_buf, parts_size, event);
+    }
+    if (go_left_by_default) {
+      return PartitionSparseKernel<true, BinIdxType>(
+          qu_, gmat, rid_span, split_feature, split_cond, rid_buf, parts_size, event);
+    }
+    return PartitionSparseKernel<false, BinIdxType>(
+        qu_, gmat, rid_span, split_feature, split_cond, rid_buf, parts_size, event);
+  }
+
+  // Entry point: partitions the rows of every node in `nodes`, chained after *general_event.
+  void Partition(const GHistIndexMatrix& gmat,
+                 const std::vector<tree::ExpandEntry>& nodes,
+                 const RowSetCollection& row_set_collection,
+                 const std::vector<int32_t>& split_conditions,
+                 RegTree* p_tree,
+                 ::sycl::event* general_event) {
+    nodes_events_.resize(n_nodes_);
+    // Reset the per-node {left, right} counters; this updates *general_event to the memset.
+    parts_size_.ResizeAndFill(qu_, 2 * n_nodes_, 0, general_event);
+    // One kernel per non-empty node; every kernel waits only on the counter reset.
+    for (size_t node_in_set = 0; node_in_set < n_nodes_; node_in_set++) {
+      const int32_t nid = nodes[node_in_set].nid;
+      ::sycl::event& node_event = nodes_events_[node_in_set];
+      const auto& rid_span = row_set_collection[nid];
+      if (rid_span.Size() > 0) {
+        const RegTree::Node& node = (*p_tree)[nid];
+        xgboost::common::Span<size_t> rid_buf = GetData(node_in_set);
+        size_t* part_size = parts_size_.Data() + 2 * node_in_set;
+        int32_t split_condition = split_conditions[node_in_set];
+        switch (gmat.index.GetBinTypeSize()) {
+          case common::BinTypeSize::kUint8BinsTypeSize:
+            node_event = Partition<uint8_t>(split_condition, gmat, rid_span, node,
+                                            &rid_buf, part_size, *general_event);
+            break;
+          case common::BinTypeSize::kUint16BinsTypeSize:
+            node_event = Partition<uint16_t>(split_condition, gmat, rid_span, node,
+                                            &rid_buf, part_size, *general_event);
+            break;
+          case common::BinTypeSize::kUint32BinsTypeSize:
+            node_event = Partition<uint32_t>(split_condition, gmat, rid_span, node,
+                                            &rid_buf, part_size, *general_event);
+            break;
+          default:
+            CHECK(false);  // no default behavior
+        }
+      } else {
+        node_event = ::sycl::event();
+      }
+    }
+    // Copy the counters into result_rows_ once all node kernels have finished.
+    *general_event = qu_->memcpy(result_rows_.data(),
+                                 parts_size_.DataConst(),
+                                 sizeof(size_t) * 2 * n_nodes_,
+                                 nodes_events_);
+  }
+
   void MergeToArray(size_t nid,
                     size_t* data_result,
-                    ::sycl::event event) {
+                    ::sycl::event* event) {
     size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
     if (n_nodes_total > 0) {
       const size_t* data = data_.Data() + nodes_offsets_[nid];
-      qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
+      qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, *event);
     }
   }
 
  protected:
   std::vector<size_t> nodes_offsets_;
   std::vector<size_t> result_rows_;
+  std::vector<::sycl::event> nodes_events_;
   size_t n_nodes_;
 
   USMVector<size_t, MemoryType::on_device> parts_size_;
diff --git a/plugin/sycl/common/row_set.h b/plugin/sycl/common/row_set.h
new file mode 100644
index 000000000..574adbf8d
--- /dev/null
+++ b/plugin/sycl/common/row_set.h
@@ -0,0 +1,123 @@
+/*!
+ * Copyright 2017-2023 XGBoost contributors
+ */
+#ifndef PLUGIN_SYCL_COMMON_ROW_SET_H_
+#define PLUGIN_SYCL_COMMON_ROW_SET_H_
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include <xgboost/data.h>
+#pragma GCC diagnostic pop
+#include <algorithm>
+#include <vector>
+#include <utility>
+
+#include "../data.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+
+/*! \brief Collection of rowsets stored on device in USM memory */
+class RowSetCollection {
+ public:
+  /*! \brief data structure to store an instance set, a subset of
+   *  rows (instances) associated with a particular node in a decision
+   *  tree. */
+  struct Elem {
+    const size_t* begin{nullptr};
+    const size_t* end{nullptr};
+    bst_node_t node_id{-1};  // id of node associated with this instance set; -1 means uninitialized
+    Elem()
+         = default;
+    Elem(const size_t* begin,
+         const size_t* end,
+         bst_node_t node_id = -1)
+        : begin(begin), end(end), node_id(node_id) {}
+
+    /*! \brief number of row indices in [begin, end) */
+    inline size_t Size() const {
+      return end - begin;
+    }
+  };
+
+  inline size_t Size() const {
+    return elem_of_each_node_.size();  // counts slots cleared by AddSplit as well
+  }
+
+  /*! \brief return the element of node_id; fails the CHECK if it was never set or was split */
+  inline const Elem& operator[](unsigned node_id) const {
+    const Elem& e = elem_of_each_node_[node_id];
+    CHECK(e.begin != nullptr)
+        << "access element that is not in the set";
+    return e;
+  }
+
+  /*! \brief mutable access by node_id; no check that the element is initialized */
+  inline Elem& operator[](unsigned node_id) {
+    Elem& e = elem_of_each_node_[node_id];
+    return e;
+  }
+
+  // drop all per-node elements (row_indices_ itself is left untouched)
+  inline void Clear() {
+    elem_of_each_node_.clear();
+  }
+  // make node 0 own every row in row_indices_; the collection must be empty
+  inline void Init() {
+    CHECK_EQ(elem_of_each_node_.size(), 0U);
+
+    const size_t* begin = row_indices_.Begin();
+    const size_t* end = row_indices_.End();
+    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
+  }
+
+  auto& Data() { return row_indices_; }  // buffer holding all row indices
+
+  // split the range of node_id into a left part of n_left rows followed by n_right rows
+  inline void AddSplit(unsigned node_id,
+                       unsigned left_node_id,
+                       unsigned right_node_id,
+                       size_t n_left,
+                       size_t n_right) {
+    const Elem e = elem_of_each_node_[node_id];
+    CHECK(e.begin != nullptr);
+    size_t* all_begin = row_indices_.Begin();
+    size_t* begin = all_begin + (e.begin - all_begin);  // non-const alias of e.begin
+
+    // the two children must cover the parent's range exactly
+    CHECK_EQ(n_left + n_right, e.Size());
+    CHECK_LE(begin + n_left, e.end);
+    CHECK_EQ(begin + n_left + n_right, e.end);
+
+    // grow the node table so that both child ids are addressable
+    if (left_node_id >= elem_of_each_node_.size()) {
+      elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
+    }
+    if (right_node_id >= elem_of_each_node_.size()) {
+      elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
+    }
+
+    // left child gets the first n_left rows, right child the rest; the parent slot is cleared
+    elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
+    elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
+    elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
+  }
+
+ private:
+  // stores the row indexes in the set
+  USMVector<size_t, MemoryType::on_device> row_indices_;
+  // vector: node_id -> elements
+  std::vector<Elem> elem_of_each_node_;
+};
+
+}  // namespace common
+}  // namespace sycl
+}  // namespace xgboost
+
+
+#endif  // PLUGIN_SYCL_COMMON_ROW_SET_H_
diff --git a/plugin/sycl/data.h b/plugin/sycl/data.h
index 489fde989..f420ef470 100644
--- a/plugin/sycl/data.h
+++ b/plugin/sycl/data.h
@@ -26,8 +26,13 @@
 
 namespace xgboost {
 namespace sycl {
-enum class MemoryType { shared, on_device};
+template <typename T>
+using AtomicRef = ::sycl::atomic_ref<T,
+                                    ::sycl::memory_order::relaxed,
+                                    ::sycl::memory_scope::device,
+                                    ::sycl::access::address_space::ext_intel_global_device_space>;
 
+enum class MemoryType { shared, on_device};
 
 template <typename T>
 class USMDeleter {
@@ -166,20 +171,20 @@ class USMVector {
     }
   }
 
-  ::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) {
+  void ResizeAndFill(::sycl::queue* qu, size_t size_new, int v, ::sycl::event* event) {
     if (size_new <= size_) {
       size_ = size_new;
-      return qu->memset(data_.get(), v, size_new * sizeof(T));
+      *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
     } else if (size_new <= capacity_) {
       size_ = size_new;
-      return qu->memset(data_.get(), v, size_new * sizeof(T));
+      *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
     } else {
       size_t size_old = size_;
       auto data_old = data_;
       size_ = size_new;
       capacity_ = size_new;
       data_ = allocate_memory_(qu, size_);
-      return qu->memset(data_.get(), v, size_new * sizeof(T));
+      *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
     }
   }
 
@@ -206,11 +211,16 @@ class USMVector {
 struct DeviceMatrix {
   DMatrix* p_mat;  // Pointer to the original matrix on the host
   ::sycl::queue qu_;
-  USMVector<size_t> row_ptr;
+  USMVector<size_t, MemoryType::on_device> row_ptr;
   USMVector<Entry, MemoryType::on_device> data;
   size_t total_offset;
 
-  DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) {
+  DeviceMatrix() = default;
+
+  void Init(::sycl::queue qu, DMatrix* dmat) {
+    qu_ = qu;
+    p_mat = dmat;
+
     size_t num_row = 0;
     size_t num_nonzero = 0;
     for (auto &batch : dmat->GetBatches<SparsePage>()) {
@@ -221,27 +231,41 @@ struct DeviceMatrix {
     }
 
     row_ptr.Resize(&qu_, num_row + 1);
+    size_t* rows = row_ptr.Data();
     data.Resize(&qu_, num_nonzero);
 
     size_t data_offset = 0;
+    ::sycl::event event;
     for (auto &batch : dmat->GetBatches<SparsePage>()) {
       const auto& data_vec = batch.data.HostVector();
       const auto& offset_vec = batch.offset.HostVector();
       size_t batch_size = batch.Size();
       if (batch_size > 0) {
-        std::copy(offset_vec.data(), offset_vec.data() + batch_size,
-                  row_ptr.Data() + batch.base_rowid);
-        if (batch.base_rowid > 0) {
-          for (size_t i = 0; i < batch_size; i++)
-            row_ptr[i + batch.base_rowid] += batch.base_rowid;
+        const auto base_rowid = batch.base_rowid;
+        event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(),
+                          sizeof(size_t) * batch_size, event);
+        if (base_rowid > 0) {
+          qu.submit([&](::sycl::handler& cgh) {
+            cgh.depends_on(event);
+            cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) {
+              // Adjust the entries just copied, which start at rows[base_rowid].
+              rows[base_rowid + pid[0]] += base_rowid;
+            });
+          });
         }
-        qu.memcpy(data.Data() + data_offset,
-                  data_vec.data(),
-                  offset_vec[batch_size] * sizeof(Entry)).wait();
+        event = qu.memcpy(data.Data() + data_offset, data_vec.data(),
+                          sizeof(Entry) * offset_vec[batch_size], event);
         data_offset += offset_vec[batch_size];
+        qu.wait();
       }
     }
-    row_ptr[num_row] = data_offset;
+    qu.submit([&](::sycl::handler& cgh) {
+      cgh.depends_on(event);
+      cgh.single_task<>([=] {
+        rows[num_row] = data_offset;
+      });
+    });
+    qu.wait();
     total_offset = data_offset;
   }
 
diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc
new file mode 100644
index 000000000..49b66a710
--- /dev/null
+++ b/plugin/sycl/data/gradient_index.cc
@@ -0,0 +1,177 @@
+/*!
+ * Copyright 2017-2024 by Contributors
+ * \file gradient_index.cc
+ */
+#include <vector>
+#include <limits>
+#include <algorithm>
+
+#include "gradient_index.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+uint32_t SearchBin(const bst_float* cut_values, const uint32_t* cut_ptrs, Entry const& e) {
+  // Cut values of feature e.index occupy cut_values[first_bin, last_bin).
+  const auto first_bin = cut_ptrs[e.index];
+  const auto last_bin = cut_ptrs[e.index + 1];
+  const bst_float* upper = std::upper_bound(cut_values + first_bin, cut_values + last_bin,
+                                            e.fvalue);
+  const uint32_t idx = upper - cut_values;
+  // A value not below any cut of the feature is clamped into the feature's last bin.
+  return idx == last_bin ? idx - 1 : idx;
+}
+
+// Iterative in-place merge sort of [begin, end); it is called from a device kernel.
+template <typename BinIdxType>
+void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
+  // buf is scratch space addressed with the same indices as begin.
+  const size_t n = end - begin;
+  for (size_t width = 1; width < n; width *= 2) {
+    // Merge runs [lo, mid) and [mid, hi); an unpaired trailing run stays as is.
+    for (size_t lo = 0; lo + width < n; lo += 2 * width) {
+      const size_t mid = lo + width;
+      const size_t hi = (mid + width < n) ? mid + width : n;
+      size_t l = lo;
+      size_t r = mid;
+      for (size_t out = lo; out < hi; ++out) {
+        // Take from the right run unless the left element is strictly smaller.
+        const bool take_left = l < mid && (r == hi || begin[l] < begin[r]);
+        buf[out] = take_left ? begin[l++] : begin[r++];
+      }
+      // Copy the merged run back so the next pass reads sorted data.
+      for (size_t i = lo; i < hi; ++i) begin[i] = buf[i];
+    }
+  }
+}
+
+template <typename BinIdxType>
+void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
+                                    BinIdxType* index_data,
+                                    const DeviceMatrix &dmat,
+                                    size_t nbins,
+                                    size_t row_stride,
+                                    uint32_t* offsets) {
+  if (nbins == 0) return;
+  const xgboost::Entry *data_ptr = dmat.data.DataConst();
+  const bst_row_t *offset_vec = dmat.row_ptr.DataConst();
+  const size_t num_rows = dmat.row_ptr.Size() - 1;
+  const bst_float* cut_values = cut_device.Values().DataConst();
+  const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst();
+  size_t* hit_count_ptr = hit_count_buff.Data();
+  // Fills index_data with per-row bin ids and accumulates bin hit counts into hit_count.
+  // Sparse case only (offsets == nullptr): scratch space for the per-row sort
+  if (!offsets) {
+    // sort_buff has type uint8_t
+    sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType));
+  }
+  BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());
+  // One work-item per row: bin every entry, count bin hits, then sort and pad sparse rows.
+  auto event = qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
+      const size_t i = pid.get_id(0);
+      const size_t ibegin = offset_vec[i];
+      const size_t iend = offset_vec[i + 1];
+      const size_t size = iend - ibegin;
+      const size_t start = i * row_stride;
+      for (bst_uint j = 0; j < size; ++j) {
+        uint32_t idx = SearchBin(cut_values, cut_ptrs, data_ptr[ibegin + j]);
+        index_data[start + j] = offsets ? idx - offsets[j] : idx;
+        AtomicRef<size_t> hit_count_ref(hit_count_ptr[idx]);
+        hit_count_ref.fetch_add(1);
+      }
+      if (!offsets) {
+        // Sparse case only: sort the row's bin ids, then pad the tail of the row with nbins
+        mergeSort<BinIdxType>(index_data + start, index_data + start + size, sort_data + start);
+        for (bst_uint j = size; j < row_stride; ++j) {
+          index_data[start + j] = nbins;
+        }
+      }
+    });
+  });
+  qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event);  // after the kernel
+  qu.wait();
+}
+
+void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
+  // Dense data gets the narrowest bin type that fits; everything else is 32-bit.
+  const size_t max_bin = max_num_bins - 1;
+  const bool fits8 = isDense && max_bin <= std::numeric_limits<uint8_t>::max();
+  const bool fits16 = isDense && max_bin <= std::numeric_limits<uint16_t>::max();
+  const BinTypeSize bin_type = fits8 ? BinTypeSize::kUint8BinsTypeSize
+                             : fits16 ? BinTypeSize::kUint16BinsTypeSize
+                                      : BinTypeSize::kUint32BinsTypeSize;
+  const size_t elem_bytes =
+      fits8 ? sizeof(uint8_t) : fits16 ? sizeof(uint16_t) : sizeof(uint32_t);
+  index.SetBinTypeSize(bin_type);
+  index.Resize(elem_bytes * n_index);
+}
+
+void GHistIndexMatrix::Init(::sycl::queue qu,
+                            Context const * ctx,
+                            const DeviceMatrix& p_fmat_device,
+                            int max_bins) {
+  nfeatures = p_fmat_device.p_mat->Info().num_col_;
+  // Compute the histogram cuts and initialise cut_device from them.
+  cut = xgboost::common::SketchOnDMatrix(ctx, p_fmat_device.p_mat, max_bins);
+  cut_device.Init(qu, cut);
+
+  max_num_bins = max_bins;
+  const uint32_t nbins = cut.Ptrs().back();
+  this->nbins = nbins;
+  hit_count.resize(nbins, 0);
+  hit_count_buff.Resize(&qu, nbins, 0);
+
+  this->p_fmat = p_fmat_device.p_mat;
+  const bool isDense = p_fmat_device.p_mat->IsDense();
+  this->isDense_ = isDense;
+
+  index.setQueue(qu);
+  // row_stride = the largest number of stored entries in any row.
+  row_stride = 0;
+  for (const auto& batch : p_fmat_device.p_mat->GetBatches<SparsePage>()) {
+    const auto& row_offset = batch.offset.ConstHostVector();
+    for (auto i = 1ull; i < row_offset.size(); i++) {
+      row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
+    }
+  }
+
+  const size_t n_offsets = cut_device.Ptrs().Size() - 1;
+  const size_t n_rows = p_fmat_device.row_ptr.Size() - 1;
+  const size_t n_index = n_rows * row_stride;
+  ResizeIndex(n_index, isDense);
+
+  CHECK_GT(cut_device.Values().Size(), 0U);
+  // Dense only: copy the first n_offsets cut pointers; SetIndexData subtracts them from bin ids.
+  uint32_t* offsets = nullptr;
+  if (isDense) {
+    index.ResizeOffset(n_offsets);
+    offsets = index.Offset();
+    qu.memcpy(offsets, cut_device.Ptrs().DataConst(),
+              sizeof(uint32_t) * n_offsets).wait_and_throw();
+  }
+  // Fill the index using the element type selected by ResizeIndex.
+  if (isDense) {
+    BinTypeSize curent_bin_size = index.GetBinTypeSize();
+    if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
+      SetIndexData(qu, index.data<uint8_t>(), p_fmat_device, nbins, row_stride, offsets);
+
+    } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
+      SetIndexData(qu, index.data<uint16_t>(), p_fmat_device, nbins, row_stride, offsets);
+    } else {
+      CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
+      SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);
+    }
+  /* For sparse DMatrix we have to store index of feature for each bin
+     in index field to choose right offset. So offset is nullptr and index is not reduced */
+  } else {
+    SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);
+  }
+}
+
+}  // namespace common
+}  // namespace sycl
+}  // namespace xgboost
diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h
new file mode 100644
index 000000000..13577025c
--- /dev/null
+++ b/plugin/sycl/data/gradient_index.h
@@ -0,0 +1,216 @@
+/*!
+ * Copyright 2017-2024 by Contributors
+ * \file gradient_index.h
+ */
+#ifndef PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
+#define PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
+
+#include <vector>
+
+#include "../data.h"
+#include "../../src/common/hist_util.h"
+
+#include <CL/sycl.hpp>
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+/*!
+ * \brief SYCL implementation of HistogramCuts stored in USM buffers to provide access from device kernels
+ */
+class HistogramCuts {
+ protected:
+  using BinIdx = uint32_t;
+
+ public:
+  HistogramCuts() {}
+  // Keeps the queue; the USM buffers stay uninitialised until Init() is called.
+  explicit HistogramCuts(::sycl::queue qu) : qu_(qu) {}
+
+  ~HistogramCuts() {
+  }
+  // Stores the queue and initialises the three USM vectors from the host-side cuts.
+  void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) {
+    qu_ = qu;
+    cut_values_.Init(&qu_, cuts.cut_values_.HostVector());
+    cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector());
+    min_vals_.Init(&qu_, cuts.min_vals_.HostVector());
+  }
+
+  // Getters for USM buffers to pass pointers into device kernels
+  const USMVector<uint32_t>& Ptrs()      const { return cut_ptrs_;   }
+  const USMVector<float>&    Values()    const { return cut_values_; }
+  const USMVector<float>&    MinValues() const { return min_vals_;   }
+
+ private:
+  USMVector<bst_float> cut_values_;
+  USMVector<uint32_t> cut_ptrs_;
+  USMVector<float> min_vals_;
+  ::sycl::queue qu_;
+};
+
+using BinTypeSize = ::xgboost::common::BinTypeSize;
+
+/*!
+ * \brief Index data and offsets stored in USM buffers to provide access from device kernels
+ */
+struct Index {
+  Index() {
+    SetBinTypeSize(binTypeSize_);
+  }
+  Index(const Index& i) = delete;
+  Index& operator=(Index i) = delete;
+  Index(Index&& i) = delete;
+  Index& operator=(Index&& i) = delete;
+  uint32_t operator[](size_t i) const {
+    if (!offset_.Empty()) {
+      return func_(data_.DataConst(), i) + offset_[i%p_];  // add back offset of slot i % p_
+    } else {
+      return func_(data_.DataConst(), i);
+    }
+  }
+  void SetBinTypeSize(BinTypeSize binTypeSize) {
+    binTypeSize_ = binTypeSize;
+    switch (binTypeSize) {
+      case BinTypeSize::kUint8BinsTypeSize:
+        func_ = &GetValueFromUint8;
+        break;
+      case BinTypeSize::kUint16BinsTypeSize:
+        func_ = &GetValueFromUint16;
+        break;
+      case BinTypeSize::kUint32BinsTypeSize:
+        func_ = &GetValueFromUint32;
+        break;
+      default:
+        CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize  ||
+              binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
+              binTypeSize == BinTypeSize::kUint32BinsTypeSize);
+    }
+  }
+  BinTypeSize GetBinTypeSize() const {
+    return binTypeSize_;
+  }
+  // Typed views of the raw byte buffer; T is expected to match the current bin type.
+  template<typename T>
+  T* data() {
+    return reinterpret_cast<T*>(data_.Data());
+  }
+
+  template<typename T>
+  const T* data() const {
+    return reinterpret_cast<const T*>(data_.DataConst());
+  }
+
+  uint32_t* Offset() {
+    return offset_.Data();
+  }
+
+  const uint32_t* Offset() const {
+    return offset_.DataConst();
+  }
+  // Size of data_ (in bytes) divided by binTypeSize_.
+  size_t Size() const {
+    return data_.Size() / (binTypeSize_);
+  }
+
+  void Resize(const size_t nBytesData) {
+    data_.Resize(&qu_, nBytesData);
+  }
+  // Resizes the offset buffer and records its length, used as the modulus in operator[].
+  void ResizeOffset(const size_t nDisps) {
+    offset_.Resize(&qu_, nDisps);
+    p_ = nDisps;
+  }
+
+  uint8_t* begin() const {
+    return data_.Begin();
+  }
+
+  uint8_t* end() const {
+    return data_.End();
+  }
+
+  void setQueue(::sycl::queue qu) {
+    qu_ = qu;
+  }
+
+ private:
+  static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint8_t*>(t)[i];
+  }
+  static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint16_t*>(t)[i];
+  }
+  static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint32_t*>(t)[i];
+  }
+  // Reads element i of the byte buffer, reinterpreted with a fixed element width.
+  using Func = uint32_t (*)(const uint8_t*, size_t);
+
+  USMVector<uint8_t, MemoryType::on_device> data_;
+  // size of this field is equal to number of features
+  USMVector<uint32_t, MemoryType::on_device> offset_;
+  BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
+  size_t p_ {1};  // length of offset_ as set by ResizeOffset
+  Func func_;  // reader matching binTypeSize_, set by SetBinTypeSize
+
+  ::sycl::queue qu_;
+};
+
+/*!
+ * \brief Preprocessed global index matrix, in CSR format, stored in USM buffers
+ *
+ *  Transform floating values to integer index in histogram
+ */
+struct GHistIndexMatrix {
+  /*! \brief rows are laid out with a fixed stride (row_stride); no row pointer is stored */
+  /*! \brief The index data */
+  Index index;
+  /*! \brief hit count of each index */
+  std::vector<size_t> hit_count;
+  /*! \brief buffers for calculations */
+  USMVector<size_t, MemoryType::on_device> hit_count_buff;
+  USMVector<uint8_t, MemoryType::on_device> sort_buff;
+  /*! \brief The corresponding cuts */
+  xgboost::common::HistogramCuts cut;
+  HistogramCuts cut_device;
+  DMatrix* p_fmat;
+  size_t max_num_bins;  // the max_bins value passed to Init
+  size_t nbins;  // last entry of cut.Ptrs()
+  size_t nfeatures;
+  size_t row_stride;  // largest number of entries in any row; stride of one row in index
+
+  // Create a global histogram matrix based on a given DMatrix device wrapper
+  void Init(::sycl::queue qu, Context const * ctx,
+            const sycl::DeviceMatrix& p_fmat_device, int max_num_bins);
+  // Bin every entry of dmat_device into index_data and accumulate the bin hit counts.
+  template <typename BinIdxType>
+  void SetIndexData(::sycl::queue qu, BinIdxType* index_data,
+                    const sycl::DeviceMatrix &dmat_device,
+                    size_t nbins, size_t row_stride, uint32_t* offsets);
+  // Pick the bin id width (8/16/32 bit) and resize the index buffer accordingly.
+  void ResizeIndex(size_t n_index, bool isDense);
+  // Adds the hit counts of all bins of feature fid to counts[fid]; counts is not zeroed here.
+  inline void GetFeatureCounts(size_t* counts) const {
+    auto nfeature = cut_device.Ptrs().Size() - 1;
+    for (unsigned fid = 0; fid < nfeature; ++fid) {
+      auto ibegin = cut_device.Ptrs()[fid];
+      auto iend = cut_device.Ptrs()[fid + 1];
+      for (auto i = ibegin; i < iend; ++i) {
+        *(counts + fid) += hit_count[i];
+      }
+    }
+  }
+  inline bool IsDense() const {
+    return isDense_;
+  }
+
+ private:
+  bool isDense_;
+};
+
+}  // namespace common
+}  // namespace sycl
+}  // namespace xgboost
+#endif  // PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc
index dd56dd3bd..943949c2a 100755
--- a/plugin/sycl/predictor/predictor.cc
+++ b/plugin/sycl/predictor/predictor.cc
@@ -280,7 +280,8 @@ class Predictor : public xgboost::Predictor {
                     uint32_t tree_end = 0) const override {
     ::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
     // TODO(razdoburdin): remove temporary workaround after cache fix
-    sycl::DeviceMatrix device_matrix(qu, dmat);
+    sycl::DeviceMatrix device_matrix;
+    device_matrix.Init(qu, dmat);
 
     auto* out_preds = &predts->predictions;
     if (tree_end == 0) {
diff --git a/plugin/sycl/tree/expand_entry.h b/plugin/sycl/tree/expand_entry.h
new file mode 100644
index 000000000..2520ff95d
--- /dev/null
+++ b/plugin/sycl/tree/expand_entry.h
@@ -0,0 +1,50 @@
+/*!
+ * Copyright 2017-2024 by Contributors
+ * \file expand_entry.h
+ */
+#ifndef PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
+#define PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#include "../../src/tree/constraints.h"
+#pragma GCC diagnostic pop
+#include "../../src/tree/hist/expand_entry.h"
+
+namespace xgboost {
+namespace sycl {
+namespace tree {
+/* tree growing policies */
+struct ExpandEntry : public xgboost::tree::ExpandEntryImpl<ExpandEntry> {
+  static constexpr bst_node_t kRootNid = 0;
+  xgboost::tree::SplitEntry split;
+
+  ExpandEntry(int nid, int depth) : ExpandEntryImpl{nid, depth} {}
+
+  // Sibling of this node, looked up through its parent; the node must not be the root.
+  inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree) const {
+    CHECK_EQ((*p_tree)[nid].IsRoot(), false);
+    return GetSiblingId(p_tree, static_cast<size_t>((*p_tree)[nid].Parent()));
+  }
+
+  inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree, size_t parent_id) const {
+    if (p_tree->IsLeftChild(nid)) {
+      return p_tree->RightChild(parent_id);
+    }
+    return p_tree->LeftChild(parent_id);
+  }
+
+  bool IsValidImpl(xgboost::tree::TrainParam const &param, int32_t num_leaves) const {
+    // Negated comparisons (rather than > / >=) keep the existing treatment of NaN gains.
+    const bool gain_ok = !(split.loss_chg <= kRtEps) && !(split.loss_chg < param.min_split_loss);
+    const bool depth_hit = param.max_depth > 0 && depth == param.max_depth;
+    const bool leaves_hit = param.max_leaves > 0 && num_leaves == param.max_leaves;
+    return gain_ok && !depth_hit && !leaves_hit;
+  }
+};
+
+}  // namespace tree
+}  // namespace sycl
+}  // namespace xgboost
+
+#endif  // PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py
index 0227cff37..42b510eef 100644
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -32,7 +32,10 @@ def build_libxgboost(
     build_dir: pathlib.Path,
     build_config: BuildConfiguration,
 ) -> pathlib.Path:
-    """Build libxgboost in a temporary directory and obtain the path to built libxgboost"""
+    """Build libxgboost in a temporary directory and obtain the path to built
+    libxgboost.
+
+    """
     logger = logging.getLogger("xgboost.packager.build_libxgboost")
 
     if not cpp_src_dir.is_dir():
@@ -51,8 +54,8 @@ def build_libxgboost(
         cmake_cmd.extend(build_config.get_cmake_args())
 
         # Flag for cross-compiling for Apple Silicon
-        # We use environment variable because it's the only way to pass down custom flags
-        # through the cibuildwheel package, which calls `pip wheel` command.
+        # We use environment variable because it's the only way to pass down custom
+        # flags through the cibuildwheel package, which calls `pip wheel` command.
         if "CIBW_TARGET_OSX_ARM64" in os.environ:
             cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")
 
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 27331d3de..36e4bdcf0 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -804,10 +804,11 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
 
             Otherwise, one can pass a list-like input with the same length as number
             of columns in `data`, with the following possible values:
-             - "c", which represents categorical columns.
-             - "q", which represents numeric columns.
-             - "int", which represents integer columns.
-             - "i", which represents boolean columns.
+
+            - "c", which represents categorical columns.
+            - "q", which represents numeric columns.
+            - "int", which represents integer columns.
+            - "i", which represents boolean columns.
 
             Note that, while categorical types are treated differently from
             the rest for model fitting purposes, the other types do not influence
@@ -861,9 +862,9 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
         self.nthread = nthread if nthread is not None else -1
         self.silent = silent
 
-        # force into void_p, mac need to pass things in as void_p
-        if data is None:
-            self.handle: Optional[ctypes.c_void_p] = None
+        if isinstance(data, ctypes.c_void_p):
+            # Used for constructing DMatrix slice.
+            self.handle = data
             return
 
         from .data import _is_iter, dispatch_data_backend
@@ -925,9 +926,10 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
         self.handle = handle
 
     def __del__(self) -> None:
-        if hasattr(self, "handle") and self.handle:
+        if hasattr(self, "handle"):
+            assert self.handle is not None
             _check_call(_LIB.XGDMatrixFree(self.handle))
-            self.handle = None
+            del self.handle
 
     @_deprecate_positional_args
     def set_info(
@@ -1281,19 +1283,19 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
         """
         from .data import _maybe_np_slice
 
-        res = DMatrix(None)
-        res.handle = ctypes.c_void_p()
+        handle = ctypes.c_void_p()
+
         rindex = _maybe_np_slice(rindex, dtype=np.int32)
         _check_call(
             _LIB.XGDMatrixSliceDMatrixEx(
                 self.handle,
                 c_array(ctypes.c_int, rindex),
                 c_bst_ulong(len(rindex)),
-                ctypes.byref(res.handle),
+                ctypes.byref(handle),
                 ctypes.c_int(1 if allow_groups else 0),
             )
         )
-        return res
+        return DMatrix(handle)
 
     @property
     def feature_names(self) -> Optional[FeatureNames]:
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 49a0f43b7..07a08dc5f 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -1053,10 +1053,10 @@ def _is_dlpack(data: DataType) -> bool:
 
 
 def _transform_dlpack(data: DataType) -> bool:
-    from cupy import fromDlpack  # pylint: disable=E0401
+    from cupy import from_dlpack  # pylint: disable=E0401
 
     assert "used_dltensor" not in str(data)
-    data = fromDlpack(data)
+    data = from_dlpack(data)  # NOTE(review): returned below, yet annotated -> bool
     return data
 
 
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 5d651948c..c4713a9e4 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -5,12 +5,14 @@ import json
 import os
 import warnings
 from concurrent.futures import ThreadPoolExecutor
+from inspect import signature
 from typing import (
     Any,
     Callable,
     Dict,
     List,
     Optional,
+    Protocol,
     Sequence,
     Tuple,
     Type,
@@ -67,14 +69,20 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool:
     return tree_method in ("hist", "gpu_hist", None, "auto")
 
 
-SklObjective = Optional[
-    Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
-]
+class _SklObjWProto(Protocol):  # pylint: disable=too-few-public-methods
+    def __call__(
+        self,
+        y_true: ArrayLike,
+        y_pred: ArrayLike,
+        sample_weight: Optional[ArrayLike],
+    ) -> Tuple[ArrayLike, ArrayLike]: ...
 
 
-def _objective_decorator(
-    func: Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]
-) -> Objective:
+_SklObjProto = Callable[[ArrayLike, ArrayLike], Tuple[np.ndarray, np.ndarray]]
+SklObjective = Optional[Union[str, _SklObjWProto, _SklObjProto]]
+
+
+def _objective_decorator(func: Union[_SklObjWProto, _SklObjProto]) -> Objective:
     """Decorate an objective function
 
     Converts an objective function using the typical sklearn metrics
@@ -89,6 +97,8 @@ def _objective_decorator(
             The target values
         y_pred: array_like of shape [n_samples]
             The predicted values
+        sample_weight :
+            Optional sample weight, None or a ndarray.
 
     Returns
     -------
@@ -103,10 +113,25 @@ def _objective_decorator(
             ``dmatrix.get_label()``
     """
 
+    parameters = signature(func).parameters
+    supports_sw = "sample_weight" in parameters
+
     def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
-        """internal function"""
+        """Internal function."""
+        sample_weight = dmatrix.get_weight()
         labels = dmatrix.get_label()
-        return func(labels, preds)
+
+        if sample_weight.size > 0 and not supports_sw:
+            raise ValueError(
+                "Custom objective doesn't have the `sample_weight` parameter while"
+                " sample_weight is used."
+            )
+        if sample_weight.size > 0:
+            fnw = cast(_SklObjWProto, func)
+            return fnw(labels, preds, sample_weight=sample_weight)
+
+        fn = cast(_SklObjProto, func)
+        return fn(labels, preds)
 
     return inner
 
@@ -172,75 +197,121 @@ def ltr_metric_decorator(func: Callable, n_jobs: Optional[int]) -> Metric:
     return inner
 
 
-__estimator_doc = """
-    n_estimators : Optional[int]
+__estimator_doc = f"""
+    n_estimators : {Optional[int]}
         Number of gradient boosted trees.  Equivalent to number of boosting
         rounds.
 """
 
 __model_doc = f"""
-    max_depth :  Optional[int]
+    max_depth :  {Optional[int]}
+
         Maximum tree depth for base learners.
-    max_leaves :
+
+    max_leaves : {Optional[int]}
+
         Maximum number of leaves; 0 indicates no limit.
-    max_bin :
+
+    max_bin : {Optional[int]}
+
         If using histogram-based algorithm, maximum number of bins per feature
-    grow_policy :
-        Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
-        depth-wise. 1: favor splitting at nodes with highest loss change.
-    learning_rate : Optional[float]
+
+    grow_policy : {Optional[str]}
+
+        Tree growing policy.
+
+        - depthwise: Favors splitting at nodes closest to the node,
+        - lossguide: Favors splitting at nodes with highest loss change.
+
+    learning_rate : {Optional[float]}
+
         Boosting learning rate (xgb's "eta")
-    verbosity : Optional[int]
+
+    verbosity : {Optional[int]}
+
         The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 
     objective : {SklObjective}
 
         Specify the learning task and the corresponding learning objective or a custom
-        objective function to be used. For custom objective, see
-        :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
-        information.
+        objective function to be used.
+
+        For custom objective, see :doc:`/tutorials/custom_metric_obj` and
+        :ref:`custom-obj-metric` for more information, along with the end note for
+        function signatures.
+
+    booster: {Optional[str]}
+
+        Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
+
+    tree_method : {Optional[str]}
 
-    booster: Optional[str]
-        Specify which booster to use: `gbtree`, `gblinear` or `dart`.
-    tree_method: Optional[str]
         Specify which tree method to use.  Default to auto.  If this parameter is set to
         default, XGBoost will choose the most conservative option available.  It's
         recommended to study this option from the parameters document :doc:`tree method
         </treemethod>`
-    n_jobs : Optional[int]
+
+    n_jobs : {Optional[int]}
+
         Number of parallel threads used to run xgboost.  When used with other
         Scikit-Learn algorithms like grid search, you may choose which algorithm to
         parallelize and balance the threads.  Creating thread contention will
         significantly slow down both algorithms.
-    gamma : Optional[float]
-        (min_split_loss) Minimum loss reduction required to make a further partition on a
-        leaf node of the tree.
-    min_child_weight : Optional[float]
+
+    gamma : {Optional[float]}
+
+        (min_split_loss) Minimum loss reduction required to make a further partition on
+        a leaf node of the tree.
+
+    min_child_weight : {Optional[float]}
+
         Minimum sum of instance weight(hessian) needed in a child.
-    max_delta_step : Optional[float]
+
+    max_delta_step : {Optional[float]}
+
         Maximum delta step we allow each tree's weight estimation to be.
-    subsample : Optional[float]
+
+    subsample : {Optional[float]}
+
         Subsample ratio of the training instance.
-    sampling_method :
+
+    sampling_method : {Optional[str]}
+
         Sampling method. Used only by the GPU version of ``hist`` tree method.
-          - ``uniform``: select random training instances uniformly.
-          - ``gradient_based`` select random training instances with higher probability
+
+        - ``uniform``: Select random training instances uniformly.
+        - ``gradient_based``: Select random training instances with higher probability
             when the gradient and hessian are larger. (cf. CatBoost)
-    colsample_bytree : Optional[float]
+
+    colsample_bytree : {Optional[float]}
+
         Subsample ratio of columns when constructing each tree.
-    colsample_bylevel : Optional[float]
+
+    colsample_bylevel : {Optional[float]}
+
         Subsample ratio of columns for each level.
-    colsample_bynode : Optional[float]
+
+    colsample_bynode : {Optional[float]}
+
         Subsample ratio of columns for each split.
-    reg_alpha : Optional[float]
+
+    reg_alpha : {Optional[float]}
+
         L1 regularization term on weights (xgb's alpha).
-    reg_lambda : Optional[float]
+
+    reg_lambda : {Optional[float]}
+
         L2 regularization term on weights (xgb's lambda).
-    scale_pos_weight : Optional[float]
+
+    scale_pos_weight : {Optional[float]}
         Balancing of positive and negative weights.
-    base_score : Optional[float]
+
+    base_score : {Optional[float]}
+
         The initial prediction score of all instances, global bias.
-    random_state : Optional[Union[numpy.random.RandomState, numpy.random.Generator, int]]
+
+    random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]}
+
         Random number seed.
 
         .. note::
@@ -248,34 +319,44 @@ __model_doc = f"""
            Using gblinear booster with shotgun updater is nondeterministic as
            it uses Hogwild algorithm.
 
-    missing : float, default np.nan
-        Value in the data which needs to be present as a missing value.
-    num_parallel_tree: Optional[int]
+    missing : float
+
+        Value in the data which needs to be present as a missing value. Default to
+        :py:data:`numpy.nan`.
+
+    num_parallel_tree: {Optional[int]}
+
         Used for boosting random forest.
-    monotone_constraints : Optional[Union[Dict[str, int], str]]
+
+    monotone_constraints : {Optional[Union[Dict[str, int], str]]}
+
         Constraint of variable monotonicity.  See :doc:`tutorial </tutorials/monotonic>`
         for more information.
-    interaction_constraints : Optional[Union[str, List[Tuple[str]]]]
+
+    interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]}
+
         Constraints for interaction representing permitted interactions.  The
         constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
         3, 4]]``, where each inner list is a group of indices of features that are
         allowed to interact with each other.  See :doc:`tutorial
         </tutorials/feature_interaction_constraint>` for more information
-    importance_type: Optional[str]
+
+    importance_type: {Optional[str]}
+
         The feature importance type for the feature_importances\\_ property:
 
         * For tree model, it's either "gain", "weight", "cover", "total_gain" or
           "total_cover".
-        * For linear model, only "weight" is defined and it's the normalized coefficients
-          without bias.
+        * For linear model, only "weight" is defined and it's the normalized
+          coefficients without bias.
 
-    device : Optional[str]
+    device : {Optional[str]}
 
         .. versionadded:: 2.0.0
 
         Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
 
-    validate_parameters : Optional[bool]
+    validate_parameters : {Optional[bool]}
 
         Give warnings for unknown parameter.
 
@@ -283,14 +364,14 @@ __model_doc = f"""
 
         See the same parameter of :py:class:`DMatrix` for details.
 
-    feature_types : Optional[FeatureTypes]
+    feature_types : {Optional[FeatureTypes]}
 
         .. versionadded:: 1.7.0
 
         Used for specifying feature types without constructing a dataframe. See
         :py:class:`DMatrix` for details.
 
-    max_cat_to_onehot : Optional[int]
+    max_cat_to_onehot : {Optional[int]}
 
         .. versionadded:: 1.6.0
 
@@ -303,7 +384,7 @@ __model_doc = f"""
         categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
-    max_cat_threshold : Optional[int]
+    max_cat_threshold : {Optional[int]}
 
         .. versionadded:: 1.7.0
 
@@ -314,7 +395,7 @@ __model_doc = f"""
         needs to be set to have categorical feature support. See :doc:`Categorical Data
         </tutorials/categorical>` and :ref:`cat-param` for details.
 
-    multi_strategy : Optional[str]
+    multi_strategy : {Optional[str]}
 
         .. versionadded:: 2.0.0
 
@@ -327,7 +408,7 @@ __model_doc = f"""
         - ``one_output_per_tree``: One model for each target.
         - ``multi_output_tree``:  Use multi-target trees.
 
-    eval_metric : Optional[Union[str, List[str], Callable]]
+    eval_metric : {Optional[Union[str, List[str], Callable]]}
 
         .. versionadded:: 1.6.0
 
@@ -360,7 +441,7 @@ __model_doc = f"""
             )
             reg.fit(X, y, eval_set=[(X, y)])
 
-    early_stopping_rounds : Optional[int]
+    early_stopping_rounds : {Optional[int]}
 
         .. versionadded:: 1.6.0
 
@@ -383,7 +464,8 @@ __model_doc = f"""
           early stopping.  If there's more than one metric in **eval_metric**, the last
           metric will be used for early stopping.
 
-    callbacks : Optional[List[TrainingCallback]]
+    callbacks : {Optional[List[TrainingCallback]]}
+
         List of callback functions that are applied at end of each iteration.
         It is possible to use predefined callbacks by using
         :ref:`Callback API <callback_api>`.
@@ -402,7 +484,8 @@ __model_doc = f"""
                 reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
                 reg.fit(X, y)
 
-    kwargs : dict, optional
+    kwargs : {Optional[Any]}
+
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters
         can be found :doc:`here </parameter>`.
         Attempting to set a parameter via the constructor args and \\*\\*kwargs
@@ -419,13 +502,16 @@ __custom_obj_note = """
         .. note::  Custom objective function
 
             A custom objective function can be provided for the ``objective``
-            parameter. In this case, it should have the signature
-            ``objective(y_true, y_pred) -> grad, hess``:
+            parameter. In this case, it should have the signature ``objective(y_true,
+            y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
+            -> [grad, hess]``:
 
             y_true: array_like of shape [n_samples]
                 The target values
             y_pred: array_like of shape [n_samples]
                 The predicted values
+            sample_weight :
+                Optional sample weights.
 
             grad: array_like of shape [n_samples]
                 The value of the gradient for each sample point.
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index eb226611d..e44182cb3 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -95,6 +95,7 @@ from .utils import (
     deserialize_xgb_model,
     get_class_name,
     get_logger,
+    get_logger_level,
     serialize_booster,
     use_cuda,
 )
@@ -181,6 +182,8 @@ pred = Pred("prediction", "rawPrediction", "probability", "predContrib")
 
 _INIT_BOOSTER_SAVE_PATH = "init_booster.json"
 
+_LOG_TAG = "XGBoost-PySpark"
+
 
 class _SparkXGBParams(
     HasFeaturesCol,
@@ -1034,6 +1037,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
 
         num_workers = self.getOrDefault(self.num_workers)
 
+        log_level = get_logger_level(_LOG_TAG)
+
         def _train_booster(
             pandas_df_iter: Iterator[pd.DataFrame],
         ) -> Iterator[pd.DataFrame]:
@@ -1047,7 +1052,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
 
             dev_ordinal = None
             use_qdm = _can_use_qdm(booster_params.get("tree_method", None))
-
+            msg = "Training on CPUs"
             if run_on_gpu:
                 dev_ordinal = (
                     context.partitionId() if is_local else _get_gpu_id(context)
@@ -1058,10 +1063,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                 # Note: Checking `is_cudf_available` in spark worker side because
                 # spark worker might has different python environment with driver side.
                 use_qdm = use_qdm and is_cudf_available()
-                get_logger("XGBoost-PySpark").info(
-                    "Leveraging %s to train with QDM: %s",
-                    booster_params["device"],
-                    "on" if use_qdm else "off",
+                msg = (
+                    f"Leveraging {booster_params['device']} to train with "
+                    f"QDM: {'on' if use_qdm else 'off'}"
                 )
 
             if use_qdm and (booster_params.get("max_bin", None) is not None):
@@ -1070,6 +1074,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
             _rabit_args = {}
             if context.partitionId() == 0:
                 _rabit_args = _get_rabit_args(context, num_workers)
+                get_logger(_LOG_TAG, log_level).info(msg)
 
             worker_message = {
                 "rabit_msg": _rabit_args,
@@ -1127,7 +1132,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
             ret = rdd_with_resource.collect()[0]
             return ret[0], ret[1]
 
-        get_logger("XGBoost-PySpark").info(
+        get_logger(_LOG_TAG).info(
             "Running xgboost-%s on %s workers with"
             "\n\tbooster params: %s"
             "\n\ttrain_call_kwargs_params: %s"
@@ -1139,7 +1144,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
             dmatrix_kwargs,
         )
         (config, booster) = _run_job()
-        get_logger("XGBoost-PySpark").info("Finished xgboost training!")
+        get_logger(_LOG_TAG).info("Finished xgboost training!")
 
         result_xgb_model = self._convert_to_sklearn_model(
             bytearray(booster, "utf-8"), config
@@ -1342,7 +1347,7 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
         # User don't set gpu configurations, just use cpu
         if gpu_per_task is None:
             if use_gpu_by_params:
-                get_logger("XGBoost-PySpark").warning(
+                get_logger(_LOG_TAG).warning(
                     "Do the prediction on the CPUs since "
                     "no gpu configurations are set"
                 )
@@ -1377,6 +1382,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
         is_local = _is_local(_get_spark_session().sparkContext)
         run_on_gpu = self._run_on_gpu()
 
+        log_level = get_logger_level(_LOG_TAG)
+
         @pandas_udf(schema)  # type: ignore
         def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
             assert xgb_sklearn_model is not None
@@ -1413,7 +1420,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
                 else:
                     msg = "CUDF or Cupy is unavailable, fallback the inference on the CPUs"
 
-            get_logger("XGBoost-PySpark").info(msg)
+            if context.partitionId() == 0:
+                get_logger(_LOG_TAG, log_level).info(msg)
 
             def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
                 """Move the data to gpu if possible"""
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 84333df53..c0a876419 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -8,7 +8,7 @@ import os
 import sys
 import uuid
 from threading import Thread
-from typing import Any, Callable, Dict, Optional, Set, Type
+from typing import Any, Callable, Dict, Optional, Set, Type, Union
 
 import pyspark
 from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext
@@ -98,10 +98,15 @@ def _get_spark_session() -> SparkSession:
     return SparkSession.builder.getOrCreate()
 
 
-def get_logger(name: str, level: str = "INFO") -> logging.Logger:
+def get_logger(name: str, level: Optional[Union[str, int]] = None) -> logging.Logger:
     """Gets a logger by name, or creates and configures it for the first time."""
     logger = logging.getLogger(name)
-    logger.setLevel(level)
+    if level is not None:
+        logger.setLevel(level)
+    else:
+        # Default to info if not set.
+        if logger.level == logging.NOTSET:
+            logger.setLevel(logging.INFO)
     # If the logger is configured, skip the configure
     if not logger.handlers and not logging.getLogger().handlers:
         handler = logging.StreamHandler(sys.stderr)
@@ -113,6 +118,12 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
     return logger
 
 
+def get_logger_level(name: str) -> Optional[int]:
+    """Get the level set on logger ``name``; None when the level is NOTSET."""
+    logger = logging.getLogger(name)
+    return None if logger.level == logging.NOTSET else logger.level
+
+
 def _get_max_num_concurrent_tasks(spark_context: SparkContext) -> int:
     """Gets the current max number of concurrent tasks."""
     # pylint: disable=protected-access
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 389066f0e..f7d9510fa 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -815,10 +815,15 @@ def softprob_obj(
     return objective
 
 
-def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+def ls_obj(
+    y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Optional[np.ndarray] = None
+) -> Tuple[np.ndarray, np.ndarray]:
     """Least squared error."""
     grad = y_pred - y_true
     hess = np.ones(len(y_true))
+    if sample_weight is not None:
+        grad *= sample_weight
+        hess *= sample_weight
     return grad, hess
 
 
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
index a11eb3e03..72cf37aeb 100644
--- a/python-package/xgboost/testing/ranking.py
+++ b/python-package/xgboost/testing/ranking.py
@@ -100,3 +100,21 @@ def run_ranking_categorical(device: str) -> None:
     scores = cross_val_score(ltr, X, y)
     for s in scores:
         assert s > 0.7
+
+
+def run_normalization(device: str) -> None:
+    """Test that disabling lambdarank normalization improves the final ndcg@32."""
+    X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3)
+    ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device)
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e0 = ltr.evals_result()  # baseline: `lambdarank_normalization` not passed
+
+    ltr = xgb.XGBRanker(
+        objective="rank:pairwise",
+        n_estimators=4,
+        device=device,
+        lambdarank_normalization=False,
+    )
+    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
+    e1 = ltr.evals_result()  # same data and settings, normalization disabled
+    assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1]
diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h
index f2a9ff528..8a5b31c36 100644
--- a/src/collective/aggregator.h
+++ b/src/collective/aggregator.h
@@ -1,22 +1,21 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023-2024, XGBoost contributors
  *
  * Higher level functions built on top the Communicator API, taking care of behavioral differences
  * between row-split vs column-split distributed training, and horizontal vs vertical federated
  * learning.
  */
 #pragma once
-#include <xgboost/data.h>
-
 #include <limits>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "communicator-inl.h"
+#include "xgboost/collective/result.h"  // for Result
+#include "xgboost/data.h"               // for MetaInfo
 
-namespace xgboost {
-namespace collective {
+namespace xgboost::collective {
 
 /**
  * @brief Apply the given function where the labels are.
@@ -31,15 +30,16 @@ namespace collective {
  * @param size The size of the buffer.
  * @param function The function used to calculate the results.
  */
-template <typename Function>
-void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) {
+template <typename FN>
+void ApplyWithLabels(Context const*, MetaInfo const& info, void* buffer, std::size_t size,
+                     FN&& function) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
     std::string message;
     if (collective::GetRank() == 0) {
       try {
-        std::forward<Function>(function)();
+        std::forward<FN>(function)();
       } catch (dmlc::Error& e) {
         message = e.what();
       }
@@ -52,7 +52,7 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
       LOG(FATAL) << &message[0];
     }
   } else {
-    std::forward<Function>(function)();
+    std::forward<FN>(function)();
   }
 }
 
@@ -70,7 +70,8 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&&
  * @param function The function used to calculate the results.
  */
 template <typename T, typename Function>
-void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function&& function) {
+void ApplyWithLabels(Context const*, MetaInfo const& info, HostDeviceVector<T>* result,
+                     Function&& function) {
   if (info.IsVerticalFederated()) {
     // We assume labels are only available on worker 0, so the calculation is done there and result
     // broadcast to other workers.
@@ -114,7 +115,9 @@ void ApplyWithLabels(MetaInfo const& info, HostDeviceVector<T>* result, Function
  * @return The global max of the input.
  */
 template <typename T>
-T GlobalMax(MetaInfo const& info, T value) {
+std::enable_if_t<std::is_trivially_copy_assignable_v<T>, T> GlobalMax(Context const*,
+                                                                      MetaInfo const& info,
+                                                                      T value) {
   if (info.IsRowSplit()) {
     collective::Allreduce<collective::Operation::kMax>(&value, 1);
   }
@@ -132,16 +135,18 @@ T GlobalMax(MetaInfo const& info, T value) {
  * @param values Pointer to the inputs to sum.
  * @param size Number of values to sum.
  */
-template <typename T>
-void GlobalSum(MetaInfo const& info, T* values, size_t size) {
+template <typename T, std::int32_t kDim>
+[[nodiscard]] Result GlobalSum(Context const*, MetaInfo const& info,
+                               linalg::TensorView<T, kDim> values) {
   if (info.IsRowSplit()) {
-    collective::Allreduce<collective::Operation::kSum>(values, size);
+    collective::Allreduce<collective::Operation::kSum>(values.Values().data(), values.Size());
   }
+  return Success();
 }
 
 template <typename Container>
-void GlobalSum(MetaInfo const& info, Container* values) {
-  GlobalSum(info, values->data(), values->size());
+[[nodiscard]] Result GlobalSum(Context const* ctx, MetaInfo const& info, Container* values) {
+  return GlobalSum(ctx, info, linalg::MakeVec(values->data(), values->size()));
 }
 
 /**
@@ -157,9 +162,10 @@ void GlobalSum(MetaInfo const& info, Container* values) {
  * @return The global ratio of the two inputs.
  */
 template <typename T>
-T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
+T GlobalRatio(Context const* ctx, MetaInfo const& info, T dividend, T divisor) {
   std::array<T, 2> results{dividend, divisor};
-  GlobalSum(info, &results);
+  auto rc = GlobalSum(ctx, info, linalg::MakeVec(results.data(), results.size()));
+  collective::SafeColl(rc);
   std::tie(dividend, divisor) = std::tuple_cat(results);
   if (divisor <= 0) {
     return std::numeric_limits<T>::quiet_NaN();
@@ -167,6 +173,4 @@ T GlobalRatio(MetaInfo const& info, T dividend, T divisor) {
     return dividend / divisor;
   }
 }
-
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
diff --git a/src/collective/communicator-inl.cc b/src/collective/communicator-inl.cc
new file mode 100644
index 000000000..4164855f1
--- /dev/null
+++ b/src/collective/communicator-inl.cc
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#include "communicator-inl.h"
+
+namespace xgboost::collective {
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input) {
+  auto n_inputs = input.size();
+  std::vector<std::int64_t> sizes(n_inputs);  // length of each local vector
+  std::transform(input.cbegin(), input.cend(), sizes.begin(),
+                 [](auto const &vec) { return vec.size(); });
+
+  std::vector<std::int64_t> global_sizes = AllgatherV(sizes);  // lengths from every worker
+  std::vector<std::int64_t> offset(global_sizes.size() + 1);  // prefix sums of global_sizes
+  offset[0] = 0;
+  for (std::size_t i = 1; i < offset.size(); i++) {
+    offset[i] = offset[i - 1] + global_sizes[i - 1];
+  }
+
+  std::vector<char> collected;  // local vectors flattened into one contiguous buffer
+  for (auto const &vec : input) {
+    collected.insert(collected.end(), vec.cbegin(), vec.cend());
+  }
+  auto out = AllgatherV(collected);
+
+  std::vector<std::vector<char>> result;
+  for (std::size_t i = 1; i < offset.size(); ++i) {  // cut `out` back into vectors by offset
+    std::vector<char> local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]);
+    result.emplace_back(std::move(local));
+  }
+  return result;
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index 34212def2..991e19f2c 100644
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost contributors
+ * Copyright 2022-2024, XGBoost contributors
  */
 #pragma once
 #include <string>
@@ -192,6 +192,18 @@ inline std::vector<T> AllgatherV(std::vector<T> const &input) {
   return result;
 }
 
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ *
+ * @param input All the inputs from the local worker. The number of inputs can vary
+ *              across different workers, and the size of each vector in the input
+ *              can also vary.
+ *
+ * @return The AllgatherV result, containing vectors from all workers.
+ */
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input);
+
 /**
  * @brief Gathers variable-length strings from all processes and distributes them to all processes.
  * @param input Variable-length list of variable-length strings.
@@ -294,38 +306,5 @@ template <Operation op>
 inline void Allreduce(double *send_receive_buffer, size_t count) {
   Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }
-
-template <typename T>
-struct SpecialAllgatherVResult {
-  std::vector<std::size_t> offsets;
-  std::vector<std::size_t> sizes;
-  std::vector<T> result;
-};
-
-/**
- * @brief Gathers variable-length data from all processes and distributes it to all processes.
- *
- * We assume each worker has the same number of inputs, but each input may be of a different size.
- *
- * @param inputs All the inputs from the local worker.
- * @param sizes  Sizes of each input.
- */
-template <typename T>
-inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
-                                                    std::vector<std::size_t> const &sizes) {
-  // Gather the sizes across all workers.
-  auto const all_sizes = Allgather(sizes);
-
-  // Calculate input offsets (std::exclusive_scan).
-  std::vector<std::size_t> offsets(all_sizes.size());
-  for (std::size_t i = 1; i < offsets.size(); i++) {
-    offsets[i] = offsets[i - 1] + all_sizes[i - 1];
-  }
-
-  // Gather all the inputs.
-  auto const all_inputs = AllgatherV(inputs);
-
-  return {offsets, all_sizes, all_inputs};
-}
 }  // namespace collective
 }  // namespace xgboost
diff --git a/src/common/json.cc b/src/common/json.cc
index de9a89f78..2887eeccf 100644
--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -1,11 +1,12 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #include "xgboost/json.h"
 
 #include <array>             // for array
 #include <cctype>            // for isdigit
 #include <cmath>             // for isinf, isnan
+#include <cstdint>           // for uint8_t, uint16_t, uint32_t
 #include <cstdio>            // for EOF
 #include <cstdlib>           // for size_t, strtof
 #include <cstring>           // for memcpy
@@ -72,15 +73,16 @@ void JsonWriter::Visit(JsonNumber const* num) {
 }
 
 void JsonWriter::Visit(JsonInteger const* num) {
-  char i2s_buffer_[NumericLimits<int64_t>::kToCharsSize];
+  std::array<char, NumericLimits<int64_t>::kToCharsSize> i2s_buffer_;
   auto i = num->GetInteger();
-  auto ret = to_chars(i2s_buffer_, i2s_buffer_ + NumericLimits<int64_t>::kToCharsSize, i);
+  auto ret =
+      to_chars(i2s_buffer_.data(), i2s_buffer_.data() + NumericLimits<int64_t>::kToCharsSize, i);
   auto end = ret.ptr;
   CHECK(ret.ec == std::errc());
-  auto digits = std::distance(i2s_buffer_, end);
+  auto digits = std::distance(i2s_buffer_.data(), end);
   auto ori_size = stream_->size();
   stream_->resize(ori_size + digits);
-  std::memcpy(stream_->data() + ori_size, i2s_buffer_, digits);
+  std::memcpy(stream_->data() + ori_size, i2s_buffer_.data(), digits);
 }
 
 void JsonWriter::Visit(JsonNull const* ) {
@@ -143,8 +145,10 @@ std::string Value::TypeStr() const {
       return "Null";
     case ValueKind::kInteger:
       return "Integer";
-    case ValueKind::kNumberArray:
+    case ValueKind::kF32Array:
       return "F32Array";
+    case ValueKind::kF64Array:
+      return "F64Array";
     case ValueKind::kU8Array:
       return "U8Array";
     case ValueKind::kI32Array:
@@ -262,10 +266,11 @@ bool JsonTypedArray<T, kind>::operator==(Value const& rhs) const {
   return std::equal(arr.cbegin(), arr.cend(), vec_.cbegin());
 }
 
-template class JsonTypedArray<float, Value::ValueKind::kNumberArray>;
-template class JsonTypedArray<uint8_t, Value::ValueKind::kU8Array>;
-template class JsonTypedArray<int32_t, Value::ValueKind::kI32Array>;
-template class JsonTypedArray<int64_t, Value::ValueKind::kI64Array>;
+template class JsonTypedArray<float, Value::ValueKind::kF32Array>;
+template class JsonTypedArray<double, Value::ValueKind::kF64Array>;
+template class JsonTypedArray<std::uint8_t, Value::ValueKind::kU8Array>;
+template class JsonTypedArray<std::int32_t, Value::ValueKind::kI32Array>;
+template class JsonTypedArray<std::int64_t, Value::ValueKind::kI64Array>;
 
 // Json Number
 bool JsonNumber::operator==(Value const& rhs) const {
@@ -708,6 +713,8 @@ Json UBJReader::ParseArray() {
     switch (type) {
       case 'd':
         return ParseTypedArray<F32Array>(n);
+      case 'D':
+        return ParseTypedArray<F64Array>(n);
       case 'U':
         return ParseTypedArray<U8Array>(n);
       case 'l':
@@ -791,12 +798,16 @@ Json UBJReader::Parse() {
         return Json{JsonBoolean{true}};
       }
       case 'F': {
-        return Json{JsonBoolean{true}};
+        return Json{JsonBoolean{false}};
       }
       case 'd': {
         auto v = this->ReadPrimitive<float>();
         return Json{v};
       }
+      case 'D': {
+        auto v = this->ReadPrimitive<double>();
+        return Json{v};
+      }
       case 'S': {
         auto str = this->DecodeStr();
         return Json{str};
@@ -825,10 +836,6 @@ Json UBJReader::Parse() {
         Integer::Int i = this->ReadPrimitive<char>();
         return Json{i};
       }
-      case 'D': {
-        LOG(FATAL) << "f64 is not supported.";
-        break;
-      }
       case 'H': {
         LOG(FATAL) << "High precision number is not supported.";
         break;
@@ -882,6 +889,8 @@ void WriteTypedArray(JsonTypedArray<T, kind> const* arr, std::vector<char>* stre
   stream->push_back('$');
   if (std::is_same<T, float>::value) {
     stream->push_back('d');
+  } else if (std::is_same_v<T, double>) {
+    stream->push_back('D');
   } else if (std::is_same<T, int8_t>::value) {
     stream->push_back('i');
   } else if (std::is_same<T, uint8_t>::value) {
@@ -910,6 +919,7 @@ void WriteTypedArray(JsonTypedArray<T, kind> const* arr, std::vector<char>* stre
 }
 
 void UBJWriter::Visit(F32Array const* arr) { WriteTypedArray(arr, stream_); }
+void UBJWriter::Visit(F64Array const* arr) { WriteTypedArray(arr, stream_); }
 void UBJWriter::Visit(U8Array const* arr) { WriteTypedArray(arr, stream_); }
 void UBJWriter::Visit(I32Array const* arr) { WriteTypedArray(arr, stream_); }
 void UBJWriter::Visit(I64Array const* arr) { WriteTypedArray(arr, stream_); }
diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh
index 74cee3757..21fad2dc0 100644
--- a/src/common/linalg_op.cuh
+++ b/src/common/linalg_op.cuh
@@ -13,15 +13,14 @@
 #include "xgboost/context.h"  // for Context
 #include "xgboost/linalg.h"   // for TensorView
 
-namespace xgboost {
-namespace linalg {
+namespace xgboost::linalg {
 namespace cuda_impl {
 // Use template specialization to dispatch, Windows + CUDA 11.8 doesn't support extended
 // lambda inside constexpr if
 template <typename T, std::int32_t D>
 struct ElementWiseImpl {
   template <typename Fn>
-  void operator()(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
+  void operator()(TensorView<T, D> t, Fn&& fn, cudaStream_t s) {
     static_assert(D > 1);
     dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) mutable {
       std::apply(fn, linalg::UnravelIndex(i, t.Shape()));
@@ -32,37 +31,59 @@ struct ElementWiseImpl {
 template <typename T>
 struct ElementWiseImpl<T, 1> {
   template <typename Fn>
-  void operator()(linalg::TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
+  void operator()(TensorView<T, 1> t, Fn&& fn, cudaStream_t s) {
     dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) { fn(i); });
   }
 };
 
 template <typename T, std::int32_t D, typename Fn>
-void ElementWiseKernel(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
+void ElementWiseKernel(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
   dh::safe_cuda(cudaSetDevice(t.Device().ordinal));
   cuda_impl::ElementWiseImpl<T, D>{}(t, fn, s);
 }
 }  // namespace cuda_impl
 
 template <typename T, int32_t D, typename Fn>
-void ElementWiseTransformDevice(linalg::TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr)
-{
+void ElementWiseTransformDevice(TensorView<T, D> t, Fn&& fn, cudaStream_t s = nullptr) {
   if (t.Contiguous()) {
     auto ptr = t.Values().data();
     dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); });
   } else {
     dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable {
-      T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
+      T& v = detail::Apply(t, UnravelIndex(i, t.Shape()));
       v = fn(i, v);
     });
   }
 }
 
 template <typename T, int32_t D, typename Fn>
-void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn) {
+void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
   ctx->IsCUDA() ? cuda_impl::ElementWiseKernel(t, fn)
                 : ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
-}  // namespace linalg
-}  // namespace xgboost
+
+namespace detail {
+template <typename T, std::int32_t kDim>
+struct IterOp {
+  TensorView<T, kDim> v;
+  XGBOOST_DEVICE T& operator()(std::size_t i) {
+    return detail::Apply(v, UnravelIndex(i, v.Shape()));
+  }
+};
+}  // namespace detail
+
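+// Naming: "t" for thrust, "c" for const, followed by begin/end (cf. std::cbegin/std::cend).
+// Returns a thrust iterator that visits the elements of a tensor view by flattened index.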
+template <typename T, std::int32_t kDim>
+auto tcbegin(TensorView<T, kDim> v) {  // NOLINT
+  return dh::MakeTransformIterator<T>(
+      thrust::make_counting_iterator(0ul),
+      detail::IterOp<std::add_const_t<std::remove_const_t<T>>, kDim>{v});
+}
+
+template <typename T, std::int32_t kDim>
+auto tcend(TensorView<T, kDim> v) {  // NOLINT
+  return tcbegin(v) + v.Size();
+}
+}  // namespace xgboost::linalg
 #endif  // XGBOOST_COMMON_LINALG_OP_CUH_
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index c74db99e4..e521fae69 100644
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include "quantile.h"
 
@@ -145,7 +145,7 @@ struct QuantileAllreduce {
 
 template <typename WQSketch>
 void SketchContainerImpl<WQSketch>::GatherSketchInfo(
-    Context const *, MetaInfo const &info,
+    Context const *ctx, MetaInfo const &info,
     std::vector<typename WQSketch::SummaryContainer> const &reduced,
     std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
     std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -171,7 +171,9 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
   std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1);
 
   // Gather all column pointers
-  collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size());
+  auto rc =
+      collective::GlobalSum(ctx, info, linalg::MakeVec(sketches_scan.data(), sketches_scan.size()));
+  collective::SafeColl(rc);
   for (int32_t i = 0; i < world; ++i) {
     size_t back = (i + 1) * (n_columns + 1) - 1;
     auto n_entries = sketches_scan.at(back);
@@ -199,14 +201,15 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
 
   static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float),
                 "Unexpected size of sketch entry.");
-  collective::GlobalSum(
-      info,
-      reinterpret_cast<float *>(global_sketches.data()),
-      global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float));
+  rc = collective::GlobalSum(
+      ctx, info,
+      linalg::MakeVec(reinterpret_cast<float *>(global_sketches.data()),
+                      global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)));
+  collective::SafeColl(rc);
 }
 
 template <typename WQSketch>
-void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo const& info) {
+void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const* ctx, MetaInfo const& info) {
   auto world_size = collective::GetWorldSize();
   auto rank = collective::GetRank();
   if (world_size == 1 || info.IsColumnSplit()) {
@@ -226,7 +229,9 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   std::vector<size_t> global_feat_ptrs(feature_ptr.size() * world_size, 0);
   size_t feat_begin = rank * feature_ptr.size();  // pointer to current worker
   std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin);
-  collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size());
+  auto rc = collective::GlobalSum(
+      ctx, info, linalg::MakeVec(global_feat_ptrs.data(), global_feat_ptrs.size()));
+  collective::SafeColl(rc);  // check rc like every other GlobalSum call site
 
   // move all categories into a flatten vector to prepare for allreduce
   size_t total = feature_ptr.back();
@@ -239,7 +244,9 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   // indptr for indexing workers
   std::vector<size_t> global_worker_ptr(world_size + 1, 0);
   global_worker_ptr[rank + 1] = total;  // shift 1 to right for constructing the indptr
-  collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size());
+  rc = collective::GlobalSum(ctx, info,
+                             linalg::MakeVec(global_worker_ptr.data(), global_worker_ptr.size()));
+  collective::SafeColl(rc);
   std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin());
   // total number of categories in all workers with all features
   auto gtotal = global_worker_ptr.back();
@@ -251,7 +258,9 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo
   CHECK_EQ(rank_size, total);
   std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin);
   // gather values from all workers.
-  collective::GlobalSum(info, global_categories.data(), global_categories.size());
+  rc = collective::GlobalSum(ctx, info,
+                             linalg::MakeVec(global_categories.data(), global_categories.size()));
+  collective::SafeColl(rc);
   QuantileAllreduce<float> allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs,
                                             categories_.size()};
   ParallelFor(categories_.size(), n_threads_, [&](auto fidx) {
@@ -293,7 +302,9 @@ void SketchContainerImpl<WQSketch>::AllReduce(
 
   // Prune the intermediate num cuts for synchronization.
   std::vector<bst_row_t> global_column_size(columns_size_);
-  collective::GlobalSum(info, &global_column_size);
+  auto rc = collective::GlobalSum(
+      ctx, info, linalg::MakeVec(global_column_size.data(), global_column_size.size()));
+  collective::SafeColl(rc);
 
   ParallelFor(sketches_.size(), n_threads_, [&](size_t i) {
     int32_t intermediate_num_cuts = static_cast<int32_t>(
diff --git a/src/common/random.h b/src/common/random.h
index 098e94b74..908090871 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -31,7 +31,7 @@ namespace xgboost::common {
  */
 using RandomEngine = std::mt19937;
 
-#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
+#if defined(XGBOOST_CUSTOMIZE_GLOBAL_PRNG) && XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1
 /*!
  * \brief An customized random engine, used to be plugged in PRNG from other systems.
  *  The implementation of this library is not provided by xgboost core library.
diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h
index e6b87ed4b..acba0feeb 100644
--- a/src/common/ranking_utils.h
+++ b/src/common/ranking_utils.h
@@ -78,6 +78,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
 
   // unbiased
   bool lambdarank_unbiased{false};
+  bool lambdarank_normalization{true};
   double lambdarank_bias_norm{1.0};
   // ndcg
   bool ndcg_exp_gain{true};
@@ -86,6 +87,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
     return lambdarank_pair_method == that.lambdarank_pair_method &&
            lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample &&
            lambdarank_unbiased == that.lambdarank_unbiased &&
+           lambdarank_normalization == that.lambdarank_normalization &&
            lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain;
   }
   bool operator!=(LambdaRankParam const& that) const { return !(*this == that); }
@@ -134,6 +136,9 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
     DMLC_DECLARE_FIELD(lambdarank_unbiased)
         .set_default(false)
         .describe("Unbiased lambda mart. Use extended IPW to debias click position");
+    DMLC_DECLARE_FIELD(lambdarank_normalization)
+        .set_default(true)
+        .describe("Whether to normalize the leaf value for lambda rank.");
     DMLC_DECLARE_FIELD(lambdarank_bias_norm)
         .set_default(1.0)
         .set_lower_bound(0.0)
diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc
index 14131865f..2edb456c9 100644
--- a/src/gbm/gbtree_model.cc
+++ b/src/gbm/gbtree_model.cc
@@ -106,30 +106,13 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
   Validate(*this);
 }
 
-namespace {
-std::int32_t IOThreads(Context const* ctx) {
-  CHECK(ctx);
-  std::int32_t n_threads = ctx->Threads();
-  // CRAN checks for number of threads used by examples, but we might not have the right
-  // number of threads when serializing/unserializing models as nthread is a booster
-  // parameter, which is only effective after booster initialization.
-  //
-  // The threshold ratio of CPU time to user time for R is 2.5, we set the number of
-  // threads to 2.
-#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
-  n_threads = std::min(2, n_threads);
-#endif
-  return n_threads;
-}
-}  // namespace
-
 void GBTreeModel::SaveModel(Json* p_out) const {
   auto& out = *p_out;
   CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
   out["gbtree_model_param"] = ToJson(param);
   std::vector<Json> trees_json(trees.size());
 
-  common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) {
+  common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
     auto const& tree = trees[t];
     Json jtree{Object{}};
     tree->SaveModel(&jtree);
@@ -167,7 +150,7 @@ void GBTreeModel::LoadModel(Json const& in) {
   CHECK_EQ(tree_info_json.size(), param.num_trees);
   tree_info.resize(param.num_trees);
 
-  common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) {
+  common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
     auto tree_id = get<Integer const>(trees_json[t]["id"]);
     trees.at(tree_id).reset(new RegTree{});
     trees[tree_id]->LoadModel(trees_json[t]);
diff --git a/src/learner.cc b/src/learner.cc
index db72f7164..eed9dd5cd 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2024, XGBoost Contributors
  * \file learner.cc
  * \brief Implementation of learning algorithm.
  * \author Tianqi Chen
@@ -846,7 +846,7 @@ class LearnerConfiguration : public Learner {
 
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) {
     base_score->Reshape(1);
-    collective::ApplyWithLabels(info, base_score->Data(),
+    collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(),
                                 [&] { UsePtr(obj_)->InitEstimation(info, base_score); });
   }
 };
@@ -1472,7 +1472,7 @@ class LearnerImpl : public LearnerIO {
   void GetGradient(HostDeviceVector<bst_float> const& preds, MetaInfo const& info,
                    std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) {
     out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength());
-    collective::ApplyWithLabels(info, out_gpair->Data(),
+    collective::ApplyWithLabels(&ctx_, info, out_gpair->Data(),
                                 [&] { obj_->GetGradient(preds, info, iter, out_gpair); });
   }
 
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index b5d63bffc..81c731bf0 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #include "auc.h"
 
@@ -112,7 +112,9 @@ double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaI
 
   // we have 2 averages going in here, first is among workers, second is among
   // classes. allreduce sums up fp/tp auc for each class.
-  collective::GlobalSum(info, &results.Values());
+  auto rc = collective::GlobalSum(ctx, info, results);
+  collective::SafeColl(rc);
+
   double auc_sum{0};
   double tp_sum{0};
   for (size_t c = 0; c < n_classes; ++c) {
@@ -286,7 +288,7 @@ class EvalAUC : public MetricNoCache {
         InvalidGroupAUC();
       }
 
-      auc = collective::GlobalRatio(info, auc, static_cast<double>(valid_groups));
+      auc = collective::GlobalRatio(ctx_, info, auc, static_cast<double>(valid_groups));
       if (!std::isnan(auc)) {
         CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups
                          << ", valid groups: " << valid_groups;
@@ -307,7 +309,7 @@ class EvalAUC : public MetricNoCache {
         std::tie(fp, tp, auc) =
             static_cast<Curve *>(this)->EvalBinary(preds, info);
       }
-      auc = collective::GlobalRatio(info, auc, fp * tp);
+      auc = collective::GlobalRatio(ctx_, info, auc, fp * tp);
       if (!std::isnan(auc)) {
         CHECK_LE(auc, 1.0);
       }
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index 1886ff12f..863666f70 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file elementwise_metric.cu
  * \brief evaluation metrics for elementwise binary or regression.
  * \author Kailong Chen, Tianqi Chen
@@ -12,13 +12,14 @@
 #include <cmath>
 
 #include "../collective/communicator-inl.h"
-#include "../common/common.h"           // MetricNoCache
+#include "../common/common.h"  // MetricNoCache
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "../common/pseudo_huber.h"
 #include "../common/quantile_loss_utils.h"  // QuantileLossParam
 #include "../common/threading_utils.h"
 #include "metric_common.h"
+#include "xgboost/collective/result.h"  // for SafeColl
 #include "xgboost/metric.h"
 
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -30,8 +31,7 @@
 #include "../common/device_helpers.cuh"
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(elementwise_metric);
 
@@ -199,7 +199,8 @@ class PseudoErrorLoss : public MetricNoCache {
           return std::make_tuple(v, wt);
         });
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return EvalRowMAPE::GetFinal(dat[0], dat[1]);
   }
 };
@@ -243,11 +244,11 @@ struct EvalError {
 };
 
 struct EvalPoissonNegLogLik {
-  const char *Name() const {
+  [[nodiscard]] const char *Name() const {
     return "poisson-nloglik";
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
     const bst_float eps = 1e-16f;
     if (py < eps) py = eps;
     return common::LogGamma(y + 1.0f) + py - std::log(py) * y;
@@ -266,9 +267,9 @@ struct EvalPoissonNegLogLik {
  *   predt >= 0
  */
 struct EvalGammaDeviance {
-  const char *Name() const { return "gamma-deviance"; }
+  [[nodiscard]] const char *Name() const { return "gamma-deviance"; }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const {
     predt += kRtEps;
     label += kRtEps;
     return std::log(predt / label) + label / predt - 1;
@@ -287,7 +288,7 @@ struct EvalGammaNLogLik {
     return "gamma-nloglik";
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const {
     py = std::max(py, 1e-6f);
     // hardcoded dispersion.
     float constexpr kPsi = 1.0;
@@ -313,7 +314,7 @@ struct EvalTweedieNLogLik {
     CHECK(rho_ < 2 && rho_ >= 1)
         << "tweedie variance power must be in interval [1, 2)";
   }
-  const char *Name() const {
+  [[nodiscard]] const char *Name() const {
     static thread_local std::string name;
     std::ostringstream os;
     os << "tweedie-nloglik@" << rho_;
@@ -321,7 +322,7 @@ struct EvalTweedieNLogLik {
     return name.c_str();
   }
 
-  XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
+  [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const {
     bst_float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_);
     bst_float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_);
     return -a + b;
@@ -366,7 +367,8 @@ struct EvalEWiseBase : public MetricNoCache {
         });
 
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -438,7 +440,8 @@ class QuantileError : public MetricNoCache {
     if (info.num_row_ == 0) {
       // empty DMatrix on distributed env
       std::array<double, 2> dat{0.0, 0.0};
-      collective::GlobalSum(info, &dat);
+      auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+      collective::SafeColl(rc);
       CHECK_GT(dat[1], 0);
       return dat[0] / dat[1];
     }
@@ -476,7 +479,8 @@ class QuantileError : public MetricNoCache {
           return std::make_tuple(l, w);
         });
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     CHECK_GT(dat[1], 0);
     return dat[0] / dat[1];
   }
@@ -501,5 +505,4 @@ class QuantileError : public MetricNoCache {
 XGBOOST_REGISTER_METRIC(QuantileError, "quantile")
     .describe("Quantile regression error.")
     .set_body([](const char*) { return new QuantileError{}; });
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 1b148ab0f..53c38ff2a 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -1,6 +1,5 @@
-/*!
- * Copyright 2018-2022 by Contributors
- * \file metric_common.h
+/**
+ * Copyright 2018-2024, Contributors
  */
 #ifndef XGBOOST_METRIC_METRIC_COMMON_H_
 #define XGBOOST_METRIC_METRIC_COMMON_H_
@@ -24,7 +23,7 @@ class MetricNoCache : public Metric {
   double Evaluate(HostDeviceVector<float> const &predts, std::shared_ptr<DMatrix> p_fmat) final {
     double result{0.0};
     auto const &info = p_fmat->Info();
-    collective::ApplyWithLabels(info, &result, sizeof(double),
+    collective::ApplyWithLabels(ctx_, info, &result, sizeof(double),
                                 [&] { result = this->Eval(predts, info); });
     return result;
   }
diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu
index a6d215e6a..052e5eb4c 100644
--- a/src/metric/multiclass_metric.cu
+++ b/src/metric/multiclass_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file multiclass_metric.cc
  * \brief evaluation metrics for multiclass classification.
  * \author Kailong Chen, Tianqi Chen
@@ -24,8 +24,7 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(multiclass_metric);
 
@@ -40,11 +39,10 @@ class MultiClassMetricsReduction {
  public:
   MultiClassMetricsReduction() = default;
 
-  PackedReduceResult
-  CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
-                   const HostDeviceVector<bst_float> &labels,
-                   const HostDeviceVector<bst_float> &preds,
-                   const size_t n_class, int32_t n_threads) const {
+  [[nodiscard]] PackedReduceResult CpuReduceMetrics(const HostDeviceVector<bst_float>& weights,
+                                                    const HostDeviceVector<bst_float>& labels,
+                                                    const HostDeviceVector<bst_float>& preds,
+                                                    const size_t n_class, int32_t n_threads) const {
     size_t ndata = labels.Size();
 
     const auto& h_labels = labels.HostVector();
@@ -184,7 +182,8 @@ struct EvalMClassBase : public MetricNoCache {
       dat[0] = result.Residue();
       dat[1] = result.Weights();
     }
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Derived::GetFinal(dat[0], dat[1]);
   }
   /*!
@@ -247,5 +246,4 @@ XGBOOST_REGISTER_METRIC(MatchError, "merror")
 XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss")
     .describe("Multiclass negative loglikelihood.")
     .set_body([](const char*) { return new EvalMultiLogLoss(); });
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 6762aec32..53841c051 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -101,7 +101,7 @@ struct EvalAMS : public MetricNoCache {
     }
   }
 
-  const char* Name() const override {
+  [[nodiscard]] const char* Name() const override {
     return name_.c_str();
   }
 
@@ -159,7 +159,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
       exc.Rethrow();
     }
 
-    return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
+    return collective::GlobalRatio(ctx_, info, sum_metric, static_cast<double>(ngroups));
   }
 
   [[nodiscard]] const char* Name() const override {
@@ -274,7 +274,7 @@ class EvalRankWithCache : public Metric {
   double Evaluate(HostDeviceVector<float> const& preds, std::shared_ptr<DMatrix> p_fmat) override {
     double result{0.0};
     auto const& info = p_fmat->Info();
-    collective::ApplyWithLabels(info, &result, sizeof(double), [&] {
+    collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] {
       auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_);
       if (p_cache->Param() != param_) {
         p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_);
@@ -294,9 +294,10 @@ class EvalRankWithCache : public Metric {
 };
 
 namespace {
-double Finalize(Context const*, MetaInfo const& info, double score, double sw) {
+double Finalize(Context const* ctx, MetaInfo const& info, double score, double sw) {
   std::array<double, 2> dat{score, sw};
-  collective::GlobalSum(info, &dat);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), 2));
+  collective::SafeColl(rc);
   std::tie(score, sw) = std::tuple_cat(dat);
   if (sw > 0.0) {
     score = score / sw;
diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu
index eb6f1b3a1..aab0f144b 100644
--- a/src/metric/rank_metric.cu
+++ b/src/metric/rank_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023 by XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <dmlc/registry.h>
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu
index 4cf77669b..14db5c240 100644
--- a/src/metric/survival_metric.cu
+++ b/src/metric/survival_metric.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2023 by Contributors
+ * Copyright 2019-2024, Contributors
  * \file survival_metric.cu
  * \brief Metrics for survival analysis
  * \author Avinash Barnwal, Hyunsu Cho and Toby Hocking
@@ -30,8 +30,7 @@ using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType
 template <typename Distribution>
 using AFTLoss = xgboost::common::AFTLoss<Distribution>;
 
-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(survival_metric);
 
@@ -43,12 +42,11 @@ class ElementWiseSurvivalMetricsReduction {
     policy_ = policy;
   }
 
-  PackedReduceResult
-  CpuReduceMetrics(const HostDeviceVector<bst_float> &weights,
-                   const HostDeviceVector<bst_float> &labels_lower_bound,
-                   const HostDeviceVector<bst_float> &labels_upper_bound,
-                   const HostDeviceVector<bst_float> &preds,
-                   int32_t n_threads) const {
+  [[nodiscard]] PackedReduceResult CpuReduceMetrics(
+      const HostDeviceVector<bst_float>& weights,
+      const HostDeviceVector<bst_float>& labels_lower_bound,
+      const HostDeviceVector<bst_float>& labels_upper_bound,
+      const HostDeviceVector<bst_float>& preds, int32_t n_threads) const {
     size_t ndata = labels_lower_bound.Size();
     CHECK_EQ(ndata, labels_upper_bound.Size());
 
@@ -156,7 +154,7 @@ class ElementWiseSurvivalMetricsReduction {
 struct EvalIntervalRegressionAccuracy {
   void Configure(const Args&) {}
 
-  const char* Name() const {
+  [[nodiscard]] const char* Name() const {
     return "interval-regression-accuracy";
   }
 
@@ -178,7 +176,7 @@ struct EvalAFTNLogLik {
     param_.UpdateAllowUnknown(args);
   }
 
-  const char* Name() const {
+  [[nodiscard]] const char* Name() const {
     return "aft-nloglik";
   }
 
@@ -214,7 +212,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
                                   info.labels_upper_bound_, preds);
 
     std::array<double, 2> dat{result.Residue(), result.Weights()};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
     return Policy::GetFinal(dat[0], dat[1]);
   }
 
@@ -231,7 +230,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
 // This class exists because we want to perform dispatch according to the distribution type at
 // configuration time, not at prediction time.
 struct AFTNLogLikDispatcher : public MetricNoCache {
-  const char* Name() const override {
+  [[nodiscard]] const char* Name() const override {
     return "aft-nloglik";
   }
 
@@ -283,5 +282,4 @@ XGBOOST_REGISTER_METRIC(IntervalRegressionAccuracy, "interval-regression-accurac
       return new EvalEWiseSurvivalBase<EvalIntervalRegressionAccuracy>();
     });
 
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index 53676a4b8..2aef51824 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #include "adaptive.h"
 
@@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   size_t n_leaf = nidx.size();
   if (nptr.empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
+    UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
     return;
   }
 
@@ -100,7 +100,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
                                         predt.Size() / info.num_row_);
 
   collective::ApplyWithLabels(
-      info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
+      ctx, info, static_cast<void*>(quantiles.data()), quantiles.size() * sizeof(float), [&] {
         // loop over each leaf
         common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
           auto nidx = h_node_idx[k];
@@ -134,7 +134,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
         });
       });
 
-  UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree);
+  UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree);
 }
 
 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index d5c7bfda9..5a0b17210 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #include <thrust/sort.h>
 
@@ -157,7 +157,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
 
   if (nptr.Empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
+    UpdateLeafValues(ctx, &quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree);
   }
 
   predt.SetDevice(ctx->Device());
@@ -167,7 +167,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
   auto t_predt = d_predt.Slice(linalg::All(), group_idx);
 
   HostDeviceVector<float> quantiles;
-  collective::ApplyWithLabels(info, &quantiles, [&] {
+  collective::ApplyWithLabels(ctx, info, &quantiles, [&] {
     auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx));
     auto d_row_index = dh::ToSpan(ridx);
     auto seg_beg = nptr.DevicePointer();
@@ -193,6 +193,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                                         w_it + d_weights.size(), &quantiles);
     }
   });
-  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
+  UpdateLeafValues(ctx, &quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate,
+                   p_tree);
 }
 }  // namespace xgboost::obj::detail
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index a64f37f63..cbe69e79a 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  */
 #pragma once
 
@@ -17,8 +17,7 @@
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 #include "xgboost/tree_model.h"          // RegTree
 
-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {
 namespace detail {
 inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
                             std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_nptr) {
@@ -36,13 +35,14 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
   }
 }
 
-inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
-                             MetaInfo const& info, float learning_rate, RegTree* p_tree) {
+inline void UpdateLeafValues(Context const* ctx, std::vector<float>* p_quantiles,
+                             std::vector<bst_node_t> const& nidx, MetaInfo const& info,
+                             float learning_rate, RegTree* p_tree) {
   auto& tree = *p_tree;
   auto& quantiles = *p_quantiles;
   auto const& h_node_idx = nidx;
 
-  size_t n_leaf = collective::GlobalMax(info, h_node_idx.size());
+  size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size());
   CHECK(quantiles.empty() || quantiles.size() == n_leaf);
   if (quantiles.empty()) {
     quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
@@ -52,12 +52,16 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
   std::vector<int32_t> n_valids(quantiles.size());
   std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(),
                  [](float q) { return static_cast<int32_t>(!std::isnan(q)); });
-  collective::GlobalSum(info, &n_valids);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(n_valids.data(), n_valids.size()));
+  collective::SafeColl(rc);
+
   // convert to 0 for all reduce
   std::replace_if(
       quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f);
   // use the mean value
-  collective::GlobalSum(info, &quantiles);
+  rc = collective::GlobalSum(ctx, info, linalg::MakeVec(quantiles.data(), quantiles.size()));
+  collective::SafeColl(rc);
+
   for (size_t i = 0; i < n_leaf; ++i) {
     if (n_valids[i] > 0) {
       quantiles[i] /= static_cast<float>(n_valids[i]);
@@ -105,5 +109,4 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
                                predt, alpha, p_tree);
   }
 }
-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc
index efddf636e..e39134ea4 100644
--- a/src/objective/lambdarank_obj.cc
+++ b/src/objective/lambdarank_obj.cc
@@ -222,7 +222,7 @@ class LambdaRankObj : public FitIntercept {
     };
 
     MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop);
-    if (sum_lambda > 0.0) {
+    if (sum_lambda > 0.0 && param_.lambdarank_normalization) {
       double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
       std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(),
                      g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; });
@@ -474,7 +474,6 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
  public:
   void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
                        const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
-    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective.";
     if (ctx_->IsCUDA()) {
       return cuda_impl::LambdaRankGetGradientMAP(
           ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
@@ -564,7 +563,6 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
  public:
   void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
                        const MetaInfo& info, linalg::Matrix<GradientPair>* out_gpair) {
-    CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective.";
     if (ctx_->IsCUDA()) {
       return cuda_impl::LambdaRankGetGradientPairwise(
           ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()),
@@ -610,6 +608,13 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
   [[nodiscard]] const char* DefaultEvalMetric() const override {
     return this->RankEvalMetric("ndcg");
   }
+
+  [[nodiscard]] Json DefaultMetricConfig() const override {  // default metric name + parameters
+    Json config{Object{}};  // start from an empty JSON object
+    config["name"] = String{DefaultEvalMetric()};  // the metric DefaultEvalMetric() reports
+    config["lambdarank_param"] = ToJson(param_);   // serialized copy of this objective's param_
+    return config;
+  }
 };
 
 #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu
index 47d7957e8..e6f5be64c 100644
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -270,12 +270,13 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::Ran
    */
   auto d_weights = common::MakeOptionalWeights(ctx, info.weights_);
   auto w_norm = p_cache->WeightNorm();
+  auto norm = p_cache->Param().lambdarank_normalization;
   thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(),
                      [=] XGBOOST_DEVICE(std::size_t i) mutable {
                        auto g = dh::SegmentId(d_gptr, i);
                        auto sum_lambda = thrust::get<2>(d_max_lambdas[g]);
                        // Normalization
-                       if (sum_lambda > 0.0) {
+                       if (sum_lambda > 0.0 && norm) {
                          double norm = std::log2(1.0 + sum_lambda) / sum_lambda;
                          d_gpair(i, 0) *= norm;
                        }
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
index 444151295..f4713fbc8 100644
--- a/src/objective/quantile_obj.cu
+++ b/src/objective/quantile_obj.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 by XGBoost contributors
+ * Copyright 2023-2024, XGBoost contributors
  */
 #include <array>                            // std::array
 #include <cstddef>                          // std::size_t
@@ -170,7 +170,9 @@ class QuantileRegression : public ObjFunction {
     double meanq = temp(0) * sw;
 
     std::array<double, 2> dat{meanq, sw};
-    collective::GlobalSum(info, &dat);
+    auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size()));
+    collective::SafeColl(rc);
+
     std::tie(meanq, sw) = std::tuple_cat(dat);
     meanq /= (sw + kRtEps);
     base_score->Reshape(1);
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 20e04ac59..fdb06474e 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023 by XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file regression_obj.cu
  * \brief Definition of single-value regression and classification objectives.
  * \author Tianqi Chen, Kailong Chen
@@ -672,8 +672,12 @@ class MeanAbsoluteError : public ObjFunction {
     std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out),
                    [w](float v) { return v * w; });
 
-    collective::GlobalSum(info, &out.Values());
-    collective::GlobalSum(info, &w, 1);
+    auto rc = collective::Success() << [&] {
+      return collective::GlobalSum(ctx_, info, out);
+    } << [&] {
+      return collective::GlobalSum(ctx_, info, linalg::MakeVec(&w, 1));
+    };
+    collective::SafeColl(rc);
 
     if (common::CloseTo(w, 0.0)) {
       // Mostly for handling empty dataset test.
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index d97b527f0..06b8079ee 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -698,6 +698,67 @@ class CPUPredictor : public Predictor {
     }
   }
 
+  /**
+   * @brief Add contributions of the first `ntree_limit` trees to `contribs` for one batch.
+   *
+   * Each (row, group) pair owns `num_feature + 1` consecutive outputs; the last one is the bias.
+   */
+  template <typename DataView>
+  void PredictContributionKernel(DataView batch, const MetaInfo& info,
+                                 const gbm::GBTreeModel& model,
+                                 const std::vector<bst_float>* tree_weights,
+                                 std::vector<std::vector<float>>* mean_values,
+                                 std::vector<RegTree::FVec>* feat_vecs,
+                                 std::vector<bst_float>* contribs, uint32_t ntree_limit,
+                                 bool approximate, int condition,
+                                 unsigned condition_feature) const {
+    const int num_feature = model.learner_model_param->num_feature;
+    const int ngroup = model.learner_model_param->num_output_group;
+    CHECK_NE(ngroup, 0);
+    size_t const ncolumns = num_feature + 1;
+    CHECK_NE(ncolumns, 0);
+    auto base_margin = info.base_margin_.View(ctx_->Device());
+    auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
+
+    // rows are processed in parallel; each thread reuses its own feature vector
+    common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) {
+      auto row_idx = batch.base_rowid + i;
+      RegTree::FVec &feats = (*feat_vecs)[omp_get_thread_num()];
+      if (feats.Size() == 0) feats.Init(num_feature);
+      std::vector<bst_float> this_tree_contribs(ncolumns);
+      // loop over all classes
+      for (int gid = 0; gid < ngroup; ++gid) {
+        bst_float* p_contribs = &(*contribs)[(row_idx * ngroup + gid) * ncolumns];
+        feats.Fill(batch[i]);
+        // calculate contributions, skipping trees that belong to another group
+        for (unsigned j = 0; j < ntree_limit; ++j) {
+          auto *tree_mean_values = &mean_values->at(j);
+          if (model.tree_info[j] != gid) continue;
+          bst_float const weight = tree_weights == nullptr ? 1 : (*tree_weights)[j];
+          std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0);
+          if (!approximate) {
+            CalculateContributions(*model.trees[j], feats, tree_mean_values,
+                                   &this_tree_contribs[0], condition, condition_feature);
+          } else {
+            model.trees[j]->CalculateContributionsApprox(
+                feats, tree_mean_values, &this_tree_contribs[0]);
+          }
+          for (size_t ci = 0; ci < ncolumns; ++ci) {
+            p_contribs[ci] += this_tree_contribs[ci] * weight;
+          }
+        }
+        feats.Drop();
+        // add base margin to BIAS
+        if (base_margin.Size() != 0) {
+          CHECK_EQ(base_margin.Shape(1), ngroup);
+          p_contribs[ncolumns - 1] += base_margin(row_idx, gid);
+        } else {
+          p_contribs[ncolumns - 1] += base_score;
+        }
+      }
+    });
+  }
+
  public:
   explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}
 
@@ -861,7 +922,6 @@ class CPUPredictor : public Predictor {
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
     auto const n_threads = this->ctx_->Threads();
-    const int num_feature = model.learner_model_param->num_feature;
     std::vector<RegTree::FVec> feat_vecs;
     InitThreadTemp(n_threads, &feat_vecs);
     const MetaInfo& info = p_fmat->Info();
@@ -869,10 +929,7 @@ class CPUPredictor : public Predictor {
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
       ntree_limit = static_cast<unsigned>(model.trees.size());
     }
-    const int ngroup = model.learner_model_param->num_output_group;
-    CHECK_NE(ngroup, 0);
-    size_t const ncolumns = num_feature + 1;
-    CHECK_NE(ncolumns, 0);
+    size_t const ncolumns = model.learner_model_param->num_feature + 1;
     // allocate space for (number of features + bias) times the number of rows
     std::vector<bst_float>& contribs = out_contribs->HostVector();
     contribs.resize(info.num_row_ * ncolumns * model.learner_model_param->num_output_group);
@@ -884,53 +941,22 @@ class CPUPredictor : public Predictor {
     common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) {
       FillNodeMeanValues(model.trees[i].get(), &(mean_values[i]));
     });
-    auto base_margin = info.base_margin_.View(ctx_->Device());
-    auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
     // start collecting the contributions
-    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
-      auto page = batch.GetView();
-      // parallel over local batch
-      common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
-        auto row_idx = batch.base_rowid + i;
-        RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
-        if (feats.Size() == 0) {
-          feats.Init(num_feature);
-        }
-        std::vector<bst_float> this_tree_contribs(ncolumns);
-        // loop over all classes
-        for (int gid = 0; gid < ngroup; ++gid) {
-          bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
-          feats.Fill(page[i]);
-          // calculate contributions
-          for (unsigned j = 0; j < ntree_limit; ++j) {
-            auto *tree_mean_values = &mean_values.at(j);
-            std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0);
-            if (model.tree_info[j] != gid) {
-              continue;
-            }
-            if (!approximate) {
-              CalculateContributions(*model.trees[j], feats, tree_mean_values,
-                                     &this_tree_contribs[0], condition, condition_feature);
-            } else {
-              model.trees[j]->CalculateContributionsApprox(
-                  feats, tree_mean_values, &this_tree_contribs[0]);
-            }
-            for (size_t ci = 0; ci < ncolumns; ++ci) {
-              p_contribs[ci] +=
-                  this_tree_contribs[ci] *
-                  (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
-            }
-          }
-          feats.Drop();
-          // add base margin to BIAS
-          if (base_margin.Size() != 0) {
-            CHECK_EQ(base_margin.Shape(1), ngroup);
-            p_contribs[ncolumns - 1] += base_margin(row_idx, gid);
-          } else {
-            p_contribs[ncolumns - 1] += base_score;
-          }
-        }
-      });
+    if (!p_fmat->PageExists<SparsePage>()) {
+      std::vector<Entry> workspace(info.num_col_ * kUnroll * n_threads);
+      auto ft = p_fmat->Info().feature_types.ConstHostVector();
+      for (const auto &batch : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, {})) {
+        PredictContributionKernel(
+            GHistIndexMatrixView{batch, info.num_col_, ft, workspace, n_threads},
+            info, model, tree_weights, &mean_values, &feat_vecs, &contribs, ntree_limit,
+            approximate, condition, condition_feature);
+      }
+    } else {
+      for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
+        PredictContributionKernel(
+            SparsePageView{&batch}, info, model, tree_weights, &mean_values, &feat_vecs,
+            &contribs, ntree_limit, approximate, condition, condition_feature);
+      }
     }
   }
 
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index 1aa1cfa1a..47d014fac 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -1048,6 +1048,9 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
+    if (!p_fmat->PageExists<SparsePage>()) {
+      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
+    }
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
@@ -1108,6 +1111,9 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
+    if (!p_fmat->PageExists<SparsePage>()) {
+      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
+    }
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc
index 6bafa77d0..edd56fb2d 100644
--- a/src/tree/fit_stump.cc
+++ b/src/tree/fit_stump.cc
@@ -1,7 +1,7 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  *
- * \brief Utilities for estimating initial score.
+ * @brief Utilities for estimating initial score.
  */
 #include "fit_stump.h"
 
@@ -44,8 +44,11 @@ void FitStump(Context const* ctx, MetaInfo const& info,
     }
   }
   CHECK(h_sum.CContiguous());
-
-  collective::GlobalSum(info, reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2);
+  auto as_double = linalg::MakeTensorView(
+      ctx, common::Span{reinterpret_cast<double*>(h_sum.Values().data()), h_sum.Size() * 2},
+      h_sum.Size() * 2);
+  auto rc = collective::GlobalSum(ctx, info, as_double);
+  collective::SafeColl(rc);
 
   for (std::size_t i = 0; i < h_sum.Size(); ++i) {
     out(i) = static_cast<float>(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess()));
diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu
index 03055e7c9..aea7ba4d7 100644
--- a/src/tree/fit_stump.cu
+++ b/src/tree/fit_stump.cu
@@ -1,19 +1,18 @@
 /**
- * Copyright 2022-2023 by XGBoost Contributors
+ * Copyright 2022-2024, XGBoost Contributors
  *
- * \brief Utilities for estimating initial score.
+ * @brief Utilities for estimating initial score.
  */
 #if !defined(NOMINMAX) && defined(_WIN32)
 #define NOMINMAX
-#endif                                            // !defined(NOMINMAX)
-#include <thrust/execution_policy.h>              // cuda::par
-#include <thrust/iterator/counting_iterator.h>    // thrust::make_counting_iterator
+#endif                                          // !defined(NOMINMAX)
+#include <thrust/execution_policy.h>            // cuda::par
+#include <thrust/iterator/counting_iterator.h>  // thrust::make_counting_iterator
 
-#include <cstddef>                                // std::size_t
+#include <cstddef>  // std::size_t
 
-#include "../collective/aggregator.cuh"
-#include "../collective/communicator-inl.cuh"
-#include "../common/device_helpers.cuh"           // dh::MakeTransformIterator
+#include "../collective/aggregator.cuh"  // for GlobalSum
+#include "../common/device_helpers.cuh"  // dh::MakeTransformIterator
 #include "fit_stump.h"
 #include "xgboost/base.h"     // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
 #include "xgboost/context.h"  // Context
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 62e40f4d4..7f5a20dbf 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2023 by XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
@@ -52,7 +52,7 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
  *
  * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
  */
-GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
+GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair,
                                      MetaInfo const& info) {
   using GradientSumT = GradientPairPrecise;
   using T = typename GradientSumT::ValueT;
@@ -66,11 +66,14 @@ GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair c
   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
   static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
-  collective::GlobalSum(info, reinterpret_cast<ReduceT*>(&p), 4);
+  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(reinterpret_cast<ReduceT*>(&p), 4));
+  collective::SafeColl(rc);
+
   GradientPair positive_sum{p.first}, negative_sum{p.second};
 
   std::size_t total_rows = gpair.size();
-  collective::GlobalSum(info, &total_rows, 1);
+  rc = collective::GlobalSum(ctx, info, linalg::MakeVec(&total_rows, 1));
+  collective::SafeColl(rc);
 
   auto histogram_rounding =
       GradientSumT{common::CreateRoundingFactor<T>(
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index bc534d351..d25a41cb0 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
@@ -26,6 +26,47 @@
 #include "xgboost/linalg.h"            // for Constants, Vector
 
 namespace xgboost::tree {
+/**
+ * @brief Gather the expand entries from all the workers.
+ *
+ * Every local entry is dumped to binary JSON, the buffers are exchanged through
+ * `collective::VectorAllgatherV`, and each received buffer is loaded back into an entry.
+ *
+ * @param entries Local expand entries on this worker.
+ * @return Global expand entries gathered from all workers.
+ */
+template <typename ExpandEntry>
+std::enable_if_t<std::is_same_v<ExpandEntry, CPUExpandEntry> ||
+                     std::is_same_v<ExpandEntry, MultiExpandEntry>,
+                 std::vector<ExpandEntry>>
+AllgatherColumnSplit(std::vector<ExpandEntry> const &entries) {
+  auto const n_entries = entries.size();
+
+  // Serialize each local entry into its own byte buffer.
+  std::vector<std::vector<char>> serialized_entries;
+  serialized_entries.reserve(n_entries);
+  for (std::size_t i = 0; i < n_entries; ++i) {
+    Json jentry{Object{}};
+    entries[i].Save(&jentry);
+    std::vector<char> out;
+    Json::Dump(jentry, &out, std::ios::binary);
+    serialized_entries.emplace_back(std::move(out));
+  }
+  auto all_serialized = collective::VectorAllgatherV(serialized_entries);
+  // The gathered result must hold at least as many buffers as this worker contributed.
+  CHECK_GE(all_serialized.size(), n_entries);
+
+  std::vector<ExpandEntry> all_entries(all_serialized.size());
+  std::transform(all_serialized.cbegin(), all_serialized.cend(), all_entries.begin(),
+                 [](std::vector<char> const &e) {
+                   ExpandEntry entry;
+                   auto je = Json::Load(StringView{e.data(), e.size()}, std::ios::binary);
+                   entry.Load(je);
+                   return entry;
+                 });
+  return all_entries;
+}
+
 class HistEvaluator {
  private:
   struct NodeEntry {
@@ -36,8 +77,8 @@ class HistEvaluator {
   };
 
  private:
-  Context const* ctx_;
-  TrainParam const* param_;
+  Context const *ctx_;
+  TrainParam const *param_;
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   TreeEvaluator tree_evaluator_;
   bool is_col_split_{false};
@@ -202,7 +243,7 @@ class HistEvaluator {
       common::CatBitField cat_bits{best.cat_bits};
       bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin);
       CHECK_GT(partition, 0);
-      std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) {
+      std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](std::size_t c) {
         auto cat = cut_val[c + f_begin];
         cat_bits.Set(cat);
       });
@@ -285,57 +326,23 @@ class HistEvaluator {
     return left_sum;
   }
 
-  /**
-   * @brief Gather the expand entries from all the workers.
-   * @param entries Local expand entries on this worker.
-   * @return Global expand entries gathered from all workers.
-   */
-  std::vector<CPUExpandEntry> Allgather(std::vector<CPUExpandEntry> const &entries) {
-    auto const world = collective::GetWorldSize();
-    auto const num_entries = entries.size();
-
-    // First, gather all the primitive fields.
-    std::vector<CPUExpandEntry> local_entries(num_entries);
-    std::vector<uint32_t> cat_bits;
-    std::vector<std::size_t> cat_bits_sizes;
-    for (std::size_t i = 0; i < num_entries; i++) {
-      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes);
-    }
-    auto all_entries = collective::Allgather(local_entries);
-
-    // Gather all the cat_bits.
-    auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
-
-    common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) {
-      // Copy the cat_bits back into all expand entries.
-      all_entries[i].split.cat_bits.resize(gathered.sizes[i]);
-      std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i],
-                  all_entries[i].split.cat_bits.begin());
-    });
-
-    return all_entries;
-  }
-
  public:
   void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
                       common::Span<FeatureType const> feature_types, const RegTree &tree,
                       std::vector<CPUExpandEntry> *p_entries) {
     auto n_threads = ctx_->Threads();
-    auto& entries = *p_entries;
+    auto &entries = *p_entries;
     // All nodes are on the same level, so we can store the shared ptr.
-    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
-        entries.size());
+    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
     for (size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
       auto nidx = entries[nidx_in_set].nid;
-      features[nidx_in_set] =
-          column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
+      features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx));
     }
     CHECK(!features.empty());
-    const size_t grain_size =
-        std::max<size_t>(1, features.front()->Size() / n_threads);
-    common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
-      return features[nidx_in_set]->Size();
-    }, grain_size);
+    const size_t grain_size = std::max<size_t>(1, features.front()->Size() / n_threads);
+    common::BlockedSpace2d space(
+        entries.size(), [&](size_t nidx_in_set) { return features[nidx_in_set]->Size(); },
+        grain_size);
 
     std::vector<CPUExpandEntry> tloc_candidates(n_threads * entries.size());
     for (size_t i = 0; i < entries.size(); ++i) {
@@ -344,7 +351,7 @@ class HistEvaluator {
       }
     }
     auto evaluator = tree_evaluator_.GetEvaluator();
-    auto const& cut_ptrs = cut.Ptrs();
+    auto const &cut_ptrs = cut.Ptrs();
 
     common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
       auto tidx = omp_get_thread_num();
@@ -385,18 +392,16 @@ class HistEvaluator {
       }
     });
 
-    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
-         ++nidx_in_set) {
+    for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
       for (auto tidx = 0; tidx < n_threads; ++tidx) {
-        entries[nidx_in_set].split.Update(
-            tloc_candidates[n_threads * nidx_in_set + tidx].split);
+        entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split);
       }
     }
 
     if (is_col_split_) {
       // With column-wise data split, we gather the best splits from all the workers and update the
       // expand entries accordingly.
-      auto all_entries = Allgather(entries);
+      auto all_entries = AllgatherColumnSplit(entries);
       for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
         for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
           entries[nidx_in_set].split.Update(
@@ -407,7 +412,7 @@ class HistEvaluator {
   }
 
   // Add splits to tree, handles all statistic
-  void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) {
+  void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) {
     auto evaluator = tree_evaluator_.GetEvaluator();
     RegTree &tree = *p_tree;
 
@@ -437,8 +442,7 @@ class HistEvaluator {
     auto left_child = tree[candidate.nid].LeftChild();
     auto right_child = tree[candidate.nid].RightChild();
     tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
-                             tree[candidate.nid].SplitIndex(), left_weight,
-                             right_weight);
+                             tree[candidate.nid].SplitIndex(), left_weight, right_weight);
     evaluator = tree_evaluator_.GetEvaluator();
 
     snode_.resize(tree.GetNodes().size());
@@ -449,8 +453,7 @@ class HistEvaluator {
     snode_.at(right_child).root_gain =
         evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
 
-    interaction_constraints_.Split(candidate.nid,
-                                   tree[candidate.nid].SplitIndex(), left_child,
+    interaction_constraints_.Split(candidate.nid, tree[candidate.nid].SplitIndex(), left_child,
                                    right_child);
   }
 
@@ -571,53 +574,6 @@ class HistMultiEvaluator {
     return false;
   }
 
-  /**
-   * @brief Gather the expand entries from all the workers.
-   * @param entries Local expand entries on this worker.
-   * @return Global expand entries gathered from all workers.
-   */
-  std::vector<MultiExpandEntry> Allgather(std::vector<MultiExpandEntry> const &entries) {
-    auto const world = collective::GetWorldSize();
-    auto const num_entries = entries.size();
-
-    // First, gather all the primitive fields.
-    std::vector<MultiExpandEntry> local_entries(num_entries);
-    std::vector<uint32_t> cat_bits;
-    std::vector<std::size_t> cat_bits_sizes;
-    std::vector<GradientPairPrecise> gradients;
-    for (std::size_t i = 0; i < num_entries; i++) {
-      local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients);
-    }
-    auto all_entries = collective::Allgather(local_entries);
-
-    // Gather all the cat_bits.
-    auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes);
-
-    // Gather all the gradients.
-    auto const num_gradients = gradients.size();
-    auto const all_gradients = collective::Allgather(gradients);
-
-    auto const total_entries = num_entries * world;
-    auto const gradients_per_entry = num_gradients / num_entries;
-    auto const gradients_per_side = gradients_per_entry / 2;
-    common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) {
-      // Copy the cat_bits back into all expand entries.
-      all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]);
-      std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i],
-                  gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin());
-
-      // Copy the gradients back into all expand entries.
-      all_entries[i].split.left_sum.resize(gradients_per_side);
-      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side,
-                  all_entries[i].split.left_sum.begin());
-      all_entries[i].split.right_sum.resize(gradients_per_side);
-      std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side,
-                  gradients_per_side, all_entries[i].split.right_sum.begin());
-    });
-
-    return all_entries;
-  }
-
  public:
   void EvaluateSplits(RegTree const &tree, common::Span<const BoundedHistCollection *> hist,
                       common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
@@ -676,7 +632,7 @@ class HistMultiEvaluator {
     if (is_col_split_) {
       // With column-wise data split, we gather the best splits from all the workers and update the
       // expand entries accordingly.
-      auto all_entries = Allgather(entries);
+      auto all_entries = AllgatherColumnSplit(entries);
       for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) {
         for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
           entries[nidx_in_set].split.Update(
diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h
index d6315877d..fd16397e1 100644
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -90,7 +90,6 @@ struct ExpandEntryImpl {
     }
 
     self->split.is_cat = get<Boolean const>(split["is_cat"]);
-
     self->LoadGrad(split);
   }
 };
@@ -106,8 +105,8 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
   void SaveGrad(Json* p_out) const {
     auto& out = *p_out;
     auto save = [&](std::string const& name, GradStats const& sum) {
-      out[name] = F32Array{2};
-      auto& array = get<F32Array>(out[name]);
+      out[name] = F64Array{2};
+      auto& array = get<F64Array>(out[name]);
       array[0] = sum.GetGrad();
       array[1] = sum.GetHess();
     };
@@ -115,9 +114,9 @@ struct CPUExpandEntry : public ExpandEntryImpl<CPUExpandEntry> {
     save("right_sum", this->split.right_sum);
   }
   void LoadGrad(Json const& in) {
-    auto const& left_sum = get<F32Array const>(in["left_sum"]);
+    auto const& left_sum = get<F64Array const>(in["left_sum"]);
     this->split.left_sum = GradStats{left_sum[0], left_sum[1]};
-    auto const& right_sum = get<F32Array const>(in["right_sum"]);
+    auto const& right_sum = get<F64Array const>(in["right_sum"]);
     this->split.right_sum = GradStats{right_sum[0], right_sum[1]};
   }
 
@@ -173,8 +172,8 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
   void SaveGrad(Json* p_out) const {
     auto& out = *p_out;
     auto save = [&](std::string const& name, std::vector<GradientPairPrecise> const& sum) {
-      out[name] = F32Array{sum.size() * 2};
-      auto& array = get<F32Array>(out[name]);
+      out[name] = F64Array{sum.size() * 2};
+      auto& array = get<F64Array>(out[name]);
       for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) {
         array[j] = sum[i].GetGrad();
         array[j + 1] = sum[i].GetHess();
@@ -185,7 +184,7 @@ struct MultiExpandEntry : public ExpandEntryImpl<MultiExpandEntry> {
   }
   void LoadGrad(Json const& in) {
     auto load = [&](std::string const& name, std::vector<GradientPairPrecise>* p_sum) {
-      auto const& array = get<F32Array const>(in[name]);
+      auto const& array = get<F64Array const>(in[name]);
       auto& sum = *p_sum;
       sum.resize(array.size() / 2);
       for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) {
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
index f18b51926..45834cc77 100644
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2015-2023, XGBoost Contributors
+ * Copyright 2015-2024, XGBoost Contributors
  * \file tree_model.cc
  * \brief model structure for tree
  */
@@ -8,6 +8,7 @@
 #include <xgboost/json.h>
 #include <xgboost/tree_model.h>
 
+#include <array>  // for array
 #include <cmath>
 #include <iomanip>
 #include <limits>
@@ -15,7 +16,7 @@
 #include <type_traits>
 
 #include "../common/categorical.h"
-#include "../common/common.h"  // for EscapeU8
+#include "../common/common.h"    // for EscapeU8
 #include "../predictor/predict_fn.h"
 #include "io_utils.h"  // for GetElem
 #include "param.h"
@@ -31,26 +32,50 @@ namespace tree {
 DMLC_REGISTER_PARAMETER(TrainParam);
 }
 
+namespace {
+// Format a floating point value with enough digits to round-trip a 32-bit float.
+template <typename Float>
+std::enable_if_t<std::is_floating_point_v<Float>, std::string> ToStr(Float value) {
+  static_assert(std::is_floating_point<Float>::value,
+                "Use std::to_string instead for non-floating point values.");
+  std::stringstream out;
+  out << std::setprecision(std::numeric_limits<float>::max_digits10) << value;
+  return out.str();
+}
+
+// Render a vector leaf as "[v0, v1, ..., vlast]", printing at most `limit` elements.
+template <typename Float>
+std::string ToStr(linalg::VectorView<Float> value, bst_target_t limit) {
+  static_assert(std::is_floating_point<Float>::value,
+                "Use std::to_string instead for non-floating point values.");
+  std::stringstream out;
+  out << std::setprecision(std::numeric_limits<float>::max_digits10);
+  if (value.Size() == 1) {  // single element: printed bare, without brackets
+    out << value(0);
+    return out.str();
+  }
+  CHECK_GE(limit, 2);  // the first and the last element are always printed
+  auto n_leading = std::min(static_cast<bst_target_t>(value.Size() - 1), limit - 1);
+  out << "[";
+  for (std::size_t idx = 0; idx < n_leading; ++idx) {
+    out << value(idx) << ", ";
+  }
+  if (value.Size() > limit) {
+    out << "..., ";  // marks the elided middle section
+  }
+  out << value(value.Size() - 1) << "]";
+  return out.str();
+}
+}  // namespace
 /*!
  * \brief Base class for dump model implementation, modeling closely after code generator.
  */
 class TreeGenerator {
  protected:
-  static int32_t constexpr kFloatMaxPrecision =
-      std::numeric_limits<bst_float>::max_digits10;
   FeatureMap const& fmap_;
   std::stringstream ss_;
   bool const with_stats_;
 
-  template <typename Float>
-  static std::string ToStr(Float value) {
-    static_assert(std::is_floating_point<Float>::value,
-                  "Use std::to_string instead for non-floating point values.");
-    std::stringstream ss;
-    ss << std::setprecision(kFloatMaxPrecision) << value;
-    return ss.str();
-  }
-
   static std::string Tabs(uint32_t n) {
     std::string res;
     for (uint32_t i = 0; i < n; ++i) {
@@ -258,10 +283,10 @@ class TextGenerator : public TreeGenerator {
         kLeafTemplate,
         {{"{tabs}",  SuperT::Tabs(depth)},
          {"{nid}",   std::to_string(nid)},
-         {"{leaf}",  SuperT::ToStr(tree[nid].LeafValue())},
+         {"{leaf}",  ToStr(tree[nid].LeafValue())},
          {"{stats}", with_stats_ ?
           SuperT::Match(kStatTemplate,
-                        {{"{cover}", SuperT::ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
+                        {{"{cover}", ToStr(tree.Stat(nid).sum_hess)}}) : ""}});
     return result;
   }
 
@@ -311,14 +336,14 @@ class TextGenerator : public TreeGenerator {
     static std::string const kQuantitiveTemplate =
         "{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}";
     auto cond = tree[nid].SplitCond();
-    return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth);
+    return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth);
   }
 
   std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
     auto cond = tree[nid].SplitCond();
     static std::string const kNodeTemplate =
         "{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}";
-    return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
+    return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth);
   }
 
   std::string Categorical(RegTree const &tree, int32_t nid,
@@ -336,8 +361,8 @@ class TextGenerator : public TreeGenerator {
     static std::string const kStatTemplate = ",gain={loss_chg},cover={sum_hess}";
     std::string const result = SuperT::Match(
         kStatTemplate,
-        {{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)},
-         {"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}});
+        {{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)},
+         {"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}});
     return result;
   }
 
@@ -393,11 +418,11 @@ class JsonGenerator : public TreeGenerator {
     std::string result = SuperT::Match(
         kLeafTemplate,
         {{"{nid}",  std::to_string(nid)},
-         {"{leaf}", SuperT::ToStr(tree[nid].LeafValue())},
+         {"{leaf}", ToStr(tree[nid].LeafValue())},
          {"{stat}", with_stats_ ? SuperT::Match(
              kStatTemplate,
              {{"{sum_hess}",
-               SuperT::ToStr(tree.Stat(nid).sum_hess)}})  : ""}});
+               ToStr(tree.Stat(nid).sum_hess)}})  : ""}});
     return result;
   }
 
@@ -468,7 +493,7 @@ class JsonGenerator : public TreeGenerator {
         R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
         R"I("missing": {missing})I";
     bst_float cond = tree[nid].SplitCond();
-    return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth);
+    return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth);
   }
 
   std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override {
@@ -477,7 +502,7 @@ class JsonGenerator : public TreeGenerator {
         R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I"
         R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I"
         R"I("missing": {missing})I";
-    return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth);
+    return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth);
   }
 
   std::string NodeStat(RegTree const& tree, int32_t nid) const override {
@@ -485,8 +510,8 @@ class JsonGenerator : public TreeGenerator {
         R"S(, "gain": {loss_chg}, "cover": {sum_hess})S";
     auto result = SuperT::Match(
         kStatTemplate,
-        {{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)},
-         {"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}});
+        {{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)},
+         {"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}});
     return result;
   }
 
@@ -622,11 +647,11 @@ class GraphvizGenerator : public TreeGenerator {
 
  protected:
   template <bool is_categorical>
-  std::string BuildEdge(RegTree const &tree, bst_node_t nid, int32_t child, bool left) const {
+  std::string BuildEdge(RegTree const &tree, bst_node_t nidx, int32_t child, bool left) const {
     static std::string const kEdgeTemplate =
         "    {nid} -> {child} [label=\"{branch}\" color=\"{color}\"]\n";
     // Is this the default child for missing value?
-    bool is_missing = tree[nid].DefaultChild() == child;
+    bool is_missing = tree.DefaultChild(nidx) == child;
     std::string branch;
     if (is_categorical) {
       branch = std::string{left ? "no" : "yes"} + std::string{is_missing ? ", missing" : ""};
@@ -635,7 +660,7 @@ class GraphvizGenerator : public TreeGenerator {
     }
     std::string buffer =
         SuperT::Match(kEdgeTemplate,
-                {{"{nid}", std::to_string(nid)},
+                {{"{nid}", std::to_string(nidx)},
                  {"{child}", std::to_string(child)},
                  {"{color}", is_missing ? param_.yes_color : param_.no_color},
                  {"{branch}", branch}});
@@ -644,68 +669,77 @@ class GraphvizGenerator : public TreeGenerator {
 
   // Only indicator is different, so we combine all different node types into this
   // function.
-  std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t) const override {
-    auto split_index = tree[nid].SplitIndex();
-    auto cond = tree[nid].SplitCond();
+  std::string PlainNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
+    auto split_index = tree.SplitIndex(nidx);
+    auto cond = tree.SplitCond(nidx);
     static std::string const kNodeTemplate = "    {nid} [ label=\"{fname}{<}{cond}\" {params}]\n";
 
     bool has_less =
         (split_index >= fmap_.Size()) || fmap_.TypeOf(split_index) != FeatureMap::kIndicator;
     std::string result =
-        SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nid)},
+        SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nidx)},
                                       {"{fname}", GetFeatureName(fmap_, split_index)},
                                       {"{<}", has_less ? "<" : ""},
-                                      {"{cond}", has_less ? SuperT::ToStr(cond) : ""},
+                                      {"{cond}", has_less ? ToStr(cond) : ""},
                                       {"{params}", param_.condition_node_params}});
 
-    result += BuildEdge<false>(tree, nid, tree[nid].LeftChild(), true);
-    result += BuildEdge<false>(tree, nid, tree[nid].RightChild(), false);
+    result += BuildEdge<false>(tree, nidx, tree.LeftChild(nidx), true);
+    result += BuildEdge<false>(tree, nidx, tree.RightChild(nidx), false);
 
     return result;
   };
 
-  std::string Categorical(RegTree const& tree, int32_t nid, uint32_t) const override {
+  std::string Categorical(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
     static std::string const kLabelTemplate =
         "    {nid} [ label=\"{fname}:{cond}\" {params}]\n";
-    auto cats = GetSplitCategories(tree, nid);
+    auto cats = GetSplitCategories(tree, nidx);
     auto cats_str = PrintCatsAsSet(cats);
-    auto split_index = tree[nid].SplitIndex();
+    auto split_index = tree.SplitIndex(nidx);
 
     std::string result =
-        SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nid)},
+        SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nidx)},
                                        {"{fname}", GetFeatureName(fmap_, split_index)},
                                        {"{cond}", cats_str},
                                        {"{params}", param_.condition_node_params}});
 
-    result += BuildEdge<true>(tree, nid, tree[nid].LeftChild(), true);
-    result += BuildEdge<true>(tree, nid, tree[nid].RightChild(), false);
+    result += BuildEdge<true>(tree, nidx, tree.LeftChild(nidx), true);
+    result += BuildEdge<true>(tree, nidx, tree.RightChild(nidx), false);
 
     return result;
   }
 
-  std::string LeafNode(RegTree const& tree, int32_t nid, uint32_t) const override {
-    static std::string const kLeafTemplate =
-        "    {nid} [ label=\"leaf={leaf-value}\" {params}]\n";
-    auto result = SuperT::Match(kLeafTemplate, {
-        {"{nid}",        std::to_string(nid)},
-        {"{leaf-value}", ToStr(tree[nid].LeafValue())},
-        {"{params}",     param_.leaf_node_params}});
-    return result;
-  };
+  // Emit the dot statement for a leaf node; vector leaves are abbreviated.
+  std::string LeafNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override {
+    static std::string const kLeafTemplate = "    {nid} [ label=\"leaf={leaf-value}\" {params}]\n";
+    // hardcoded limit to avoid dumping long arrays into dot graph.
+    bst_target_t constexpr kLimit{3};
+    // Only the textual form of the leaf value differs between the two tree kinds.
+    std::string leaf_value;
+    if (tree.IsMultiTarget()) {
+      // Vector leaf, shortened to at most kLimit printed elements.
+      leaf_value = ToStr(tree.GetMultiTargetTree()->LeafValue(nidx), kLimit);
+    } else {
+      // Scalar leaf.
+      leaf_value = ToStr(tree[nidx].LeafValue());
+    }
+    return SuperT::Match(kLeafTemplate, {{"{nid}", std::to_string(nidx)},
+                                         {"{leaf-value}", leaf_value},
+                                         {"{params}", param_.leaf_node_params}});
+  }
 
-  std::string BuildTree(RegTree const& tree, int32_t nid, uint32_t depth) override {
-    if (tree[nid].IsLeaf()) {
-      return this->LeafNode(tree, nid, depth);
+  std::string BuildTree(RegTree const& tree, bst_node_t nidx, uint32_t depth) override {
+    if (tree.IsLeaf(nidx)) {
+      return this->LeafNode(tree, nidx, depth);
     }
     static std::string const kNodeTemplate = "{parent}\n{left}\n{right}";
-    auto node = tree.GetSplitTypes()[nid] == FeatureType::kCategorical
-                    ? this->Categorical(tree, nid, depth)
-                    : this->PlainNode(tree, nid, depth);
+    auto node = tree.GetSplitTypes()[nidx] == FeatureType::kCategorical
+                    ? this->Categorical(tree, nidx, depth)
+                    : this->PlainNode(tree, nidx, depth);
     auto result = SuperT::Match(
         kNodeTemplate,
         {{"{parent}", node},
-         {"{left}",   this->BuildTree(tree, tree[nid].LeftChild(), depth+1)},
-         {"{right}",  this->BuildTree(tree, tree[nid].RightChild(), depth+1)}});
+         {"{left}",   this->BuildTree(tree, tree.LeftChild(nidx), depth+1)},
+         {"{right}",  this->BuildTree(tree, tree.RightChild(nidx), depth+1)}});
     return result;
   }
 
@@ -733,7 +767,9 @@ XGBOOST_REGISTER_TREE_IO(GraphvizGenerator, "dot")
 constexpr bst_node_t RegTree::kRoot;
 
 std::string RegTree::DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const {
-  CHECK(!IsMultiTarget());
+  if (this->IsMultiTarget() && format != "dot") {
+    LOG(FATAL) << format << " tree dump " << MTNotImplemented();
+  }
   std::unique_ptr<TreeGenerator> builder{TreeGenerator::Create(format, fmap, with_stats)};
   builder->BuildTree(*this);
 
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 94e7547ee..68317fc41 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2023 by XGBoost contributors
+ * Copyright 2021-2024, XGBoost contributors
  *
  * \brief Implementation for the approx tree method.
  */
@@ -107,7 +107,10 @@ class GloablApproxBuilder {
     for (auto const &g : gpair) {
       root_sum.Add(g);
     }
-    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&root_sum), 2);
+    auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
+                                    linalg::MakeVec(reinterpret_cast<double *>(&root_sum), 2));
+    collective::SafeColl(rc);
+
     std::vector<CPUExpandEntry> nodes{best};
     this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_,
                                            linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1),
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index ef166fae5..45018da17 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -106,6 +106,9 @@ class ColMaker: public TreeUpdater {
     if (dmat->Info().HasCategorical()) {
       LOG(FATAL) << error::NoCategorical("Updater `grow_colmaker` or `exact` tree method");
     }
+    if (param->colsample_bynode - 1.0 != 0.0) {
+      LOG(FATAL) << "column sample by node is not yet supported by the exact tree method";
+    }
     this->LazyGetColumnDensity(dmat);
     // rescale learning rate according to size of trees
     interaction_constraints_.Configure(*param, dmat->Info().num_row_);
@@ -440,9 +443,8 @@ class ColMaker: public TreeUpdater {
     }
 
     // update the solution candidate
-    virtual void UpdateSolution(const SortedCSCPage &batch,
-                                const std::vector<bst_feature_t> &feat_set,
-                                const std::vector<GradientPair> &gpair, DMatrix *) {
+    void UpdateSolution(SortedCSCPage const &batch, const std::vector<bst_feature_t> &feat_set,
+                        const std::vector<GradientPair> &gpair) {
       // start enumeration
       const auto num_features = feat_set.size();
       CHECK(this->ctx_);
@@ -466,17 +468,15 @@ class ColMaker: public TreeUpdater {
             }
           });
     }
+
     // find splits at current level, do split per level
-    inline void FindSplit(int depth,
-                          const std::vector<int> &qexpand,
-                          const std::vector<GradientPair> &gpair,
-                          DMatrix *p_fmat,
-                          RegTree *p_tree) {
+    void FindSplit(bst_node_t depth, const std::vector<int> &qexpand,
+                   std::vector<GradientPair> const &gpair, DMatrix *p_fmat, RegTree *p_tree) {
       auto evaluator = tree_evaluator_.GetEvaluator();
 
       auto feat_set = column_sampler_->GetFeatureSet(depth);
       for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
-        this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
+        this->UpdateSolution(batch, feat_set->HostVector(), gpair);
       }
       // after this each thread's stemp will get the best candidates, aggregate results
       this->SyncBestSolution(qexpand);
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 4c535e7eb..ffd58239d 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/copy.h>
 #include <thrust/reduce.h>
@@ -735,7 +735,9 @@ struct GPUHistMakerDevice {
         dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(),
                    GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
     using ReduceT = typename decltype(root_sum_quantised)::ValueT;
-    collective::GlobalSum(info_, reinterpret_cast<ReduceT*>(&root_sum_quantised), 2);
+    auto rc = collective::GlobalSum(
+        ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
+    collective::SafeColl(rc);
 
     hist.AllocateHistograms({kRootNIdx});
     this->BuildHist(kRootNIdx);
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index c2aaedafa..ced277773 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2023, XGBoost Contributors
+ * Copyright 2017-2024, XGBoost Contributors
  * \file updater_quantile_hist.cc
  * \brief use quantized feature values to construct a tree
  * \author Philip Cho, Tianqi Checn, Egor Smirnov
@@ -149,9 +149,6 @@ class MultiTargetHistBuilder {
   }
 
   void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
-    if (collective::IsDistributed()) {
-      LOG(FATAL) << "Distributed training for vector-leaf is not yet supported.";
-    }
     monitor_->Start(__func__);
 
     p_last_fmat_ = p_fmat;
@@ -202,8 +199,10 @@ class MultiTargetHistBuilder {
       }
     }
     CHECK(root_sum.CContiguous());
-    collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(root_sum.Values().data()),
-                          root_sum.Size() * 2);
+    auto rc = collective::GlobalSum(
+        ctx_, p_fmat->Info(),
+        linalg::MakeVec(reinterpret_cast<double *>(root_sum.Values().data()), root_sum.Size() * 2));
+    collective::SafeColl(rc);
 
     histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_));
 
@@ -411,7 +410,9 @@ class HistUpdater {
         for (auto const &grad : gpair_h) {
           grad_stat.Add(grad.GetGrad(), grad.GetHess());
         }
-        collective::GlobalSum(p_fmat->Info(), reinterpret_cast<double *>(&grad_stat), 2);
+        auto rc = collective::GlobalSum(ctx_, p_fmat->Info(),
+                                        linalg::MakeVec(reinterpret_cast<double *>(&grad_stat), 2));
+        collective::SafeColl(rc);
       }
 
       auto weight = evaluator_->InitRoot(GradStats{grad_stat});
@@ -474,6 +475,7 @@ class QuantileHistMaker : public TreeUpdater {
   std::unique_ptr<HistUpdater> p_impl_{nullptr};
   std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
   std::shared_ptr<common::ColumnSampler> column_sampler_;
+
   common::Monitor monitor_;
   ObjInfo const *task_{nullptr};
   HistMakerTrainParam hist_param_;
diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh
index 3df79b58d..c6e8ef65a 100755
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x
 
 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.12
+RAPIDS_VERSION=24.02
 SPARK_VERSION=3.4.0
 JDK_VERSION=8
 R_VERSION=4.3.2
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 91b748b4c..741ef7558 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -100,6 +100,7 @@ class LintersPaths:
         # demo
         "demo/json-model/json_parser.py",
         "demo/guide-python/external_memory.py",
+        "demo/guide-python/sklearn_examples.py",
         "demo/guide-python/continuation.py",
         "demo/guide-python/callbacks.py",
         "demo/guide-python/cat_in_the_dat.py",
diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py
index dd73f850b..ddcf48674 100644
--- a/tests/ci_build/test_r_package.py
+++ b/tests/ci_build/test_r_package.py
@@ -277,6 +277,19 @@ def test_with_cmake(args: argparse.Namespace) -> None:
                     "Release",
                 ]
             )
+        elif args.compiler == "none":
+            subprocess.check_call(
+                [
+                    "cmake",
+                    os.path.pardir,
+                    "-DUSE_OPENMP=ON",
+                    "-DR_LIB=ON",
+                    "-DCMAKE_CONFIGURATION_TYPES=Release",
+                    "-G",
+                    "Unix Makefiles",
+                ]
+            )
+            subprocess.check_call(["make", "-j", "install"])
         else:
             raise ValueError("Wrong compiler")
     with DirectoryExcursion(R_PACKAGE):
@@ -333,9 +346,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--compiler",
         type=str,
-        choices=["mingw", "msvc"],
+        choices=["mingw", "msvc", "none"],
         help="Compiler used for compiling CXX code. Only relevant for windows build",
-        default="mingw",
+        default="none",
         required=False,
     )
     parser.add_argument(
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 4d2ed6b18..aa13a971f 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -34,14 +34,14 @@ if(PLUGIN_SYCL)
     ${xgboost_SOURCE_DIR}/rabit/include)
 
   target_compile_definitions(plugin_sycl_test PUBLIC -DXGBOOST_USE_SYCL=1)
-
   target_link_libraries(plugin_sycl_test PUBLIC -fsycl)
+  target_link_libraries(plugin_sycl_test PRIVATE ${GTEST_LIBRARIES})
 
   set_target_properties(plugin_sycl_test PROPERTIES
-  COMPILE_FLAGS -fsycl
-  CXX_STANDARD 17
-  CXX_STANDARD_REQUIRED ON
-  POSITION_INDEPENDENT_CODE ON)
+    COMPILE_FLAGS -fsycl
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON)
   if(USE_OPENMP)
     find_package(OpenMP REQUIRED)
     set_target_properties(plugin_sycl_test PROPERTIES
diff --git a/tests/cpp/collective/test_rabit_communicator.cc b/tests/cpp/collective/test_rabit_communicator.cc
index ba22d8fdb..9711e1aed 100644
--- a/tests/cpp/collective/test_rabit_communicator.cc
+++ b/tests/cpp/collective/test_rabit_communicator.cc
@@ -1,13 +1,12 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 
 #include "../../../src/collective/rabit_communicator.h"
+#include "../helpers.h"
 
-namespace xgboost {
-namespace collective {
-
+namespace xgboost::collective {
 TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
   auto construct = []() { RabitCommunicator comm{0, 0}; };
   EXPECT_THROW(construct(), dmlc::Error);
@@ -35,5 +34,37 @@ TEST(RabitCommunicatorSimpleTest, IsNotDistributed) {
   EXPECT_FALSE(comm.IsDistributed());
 }
 
-}  // namespace collective
-}  // namespace xgboost
+namespace {
+void VerifyVectorAllgatherV() {
+  auto n_workers = collective::GetWorldSize();
+  ASSERT_EQ(n_workers, 3);
+  auto rank = collective::GetRank();
+  // Worker `rank` contributes rank + 1 vectors, each holding 0, 1, ..., rank, so both
+  // the number of vectors and their lengths differ between workers.
+  auto const n_local = rank + 1;
+  std::vector<std::vector<char>> inputs;
+  for (std::int32_t i = 0; i < n_local; ++i) {
+    std::vector<char> in;
+    for (std::int32_t j = 0; j < n_local; ++j) {
+      in.emplace_back(static_cast<char>(j));
+    }
+    inputs.emplace_back(std::move(in));
+  }
+  auto outputs = VectorAllgatherV(inputs);
+  // 1 + 2 + ... + n_workers vectors are expected in total.
+  ASSERT_EQ(outputs.size(), (1 + n_workers) * n_workers / 2);
+  // Spot-check the first n_workers gathered vectors: each must count up from zero.
+  for (std::int32_t i = 0; i < n_workers; ++i) {
+    std::int32_t expected = 0;
+    for (auto v : outputs[i]) {
+      ASSERT_EQ(v, expected++);
+    }
+  }
+}
+}  // namespace
+
+TEST(VectorAllgatherV, Basic) {
+  std::int32_t n_workers{3};  // must match the world size asserted in VerifyVectorAllgatherV
+  RunWithInMemoryCommunicator(n_workers, VerifyVectorAllgatherV);
+}
+}  // namespace xgboost::collective
diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc
index d361552ce..3ee041a33 100644
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2023, XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
@@ -639,6 +639,40 @@ TEST(Json, TypedArray) {
       ASSERT_EQ(arr[i + 8], i);
     }
   }
+
+  {
+    Json f64{Object{}};
+    auto array = F64Array();
+    auto& vec = array.GetArray();
+    // Construct test data
+    vec.resize(18);
+    std::iota(vec.begin(), vec.end(), 0.0);
+    // special values
+    vec.push_back(std::numeric_limits<double>::epsilon());
+    vec.push_back(std::numeric_limits<double>::max());
+    vec.push_back(std::numeric_limits<double>::min());
+    vec.push_back(std::numeric_limits<double>::denorm_min());
+    vec.push_back(std::numeric_limits<double>::quiet_NaN());
+
+    static_assert(
+        std::is_same_v<double, typename std::remove_reference_t<decltype(vec)>::value_type>);
+
+    f64["f64"] = std::move(array);
+    ASSERT_TRUE(IsA<F64Array>(f64["f64"]));
+    std::vector<char> out;
+    Json::Dump(f64, &out, std::ios::binary);
+
+    auto loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary);
+    ASSERT_TRUE(IsA<F64Array>(loaded["f64"]));
+    auto const& result = get<F64Array const>(loaded["f64"]);
+
+    auto& vec1 = get<F64Array const>(f64["f64"]);
+    ASSERT_EQ(result.size(), vec1.size());
+    for (std::size_t i = 0; i < vec1.size() - 1; ++i) {
+      ASSERT_EQ(result[i], vec1[i]);
+    }
+    ASSERT_TRUE(std::isnan(result.back()));
+  }
 }
 
 TEST(UBJson, Basic) {
@@ -677,8 +711,24 @@ TEST(UBJson, Basic) {
     ASSERT_FLOAT_EQ(3.14, get<Number>(get<Array>(ret["test"])[1]));
     ASSERT_FLOAT_EQ(2.71, get<Number>(get<Array>(ret["test"])[0]));
   }
+  {
+    // boolean
+    Json boolean{Object{}};
+    boolean["foo"] = Boolean{false};
+    std::vector<char> out;
+    Json::Dump(boolean, &out, std::ios::binary);
+    auto loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary);
+
+    ASSERT_EQ(boolean, loaded);
+
+    boolean["foo"] = Boolean{true};
+    Json::Dump(boolean, &out, std::ios::binary);
+    loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary);
+    ASSERT_EQ(boolean, loaded);
+  }
 }
 
+
 TEST(Json, TypeCheck) {
   Json config{Object{}};
   config["foo"] = String{"bar"};
diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu
index d14260635..bf217842b 100644
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -1,7 +1,11 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
+#include <thrust/equal.h>     // for equal
+#include <thrust/sequence.h>  // for sequence
+
+#include "../../../src/common/cuda_context.cuh"
 #include "../../../src/common/linalg_op.cuh"
 #include "../helpers.h"
 #include "xgboost/context.h"
@@ -84,4 +88,23 @@ void TestSlice() {
 TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); }
 
 TEST(Linalg, GPUTensorView) { TestSlice(); }
+
+TEST(Linalg, GPUIter) {
+  auto ctx = MakeCUDACtx(1);
+  auto cuctx = ctx.CUDACtx();
+
+  dh::device_vector<double> data(2 * 3 * 4);
+  thrust::sequence(cuctx->CTP(), data.begin(), data.end(), 1.0);
+
+  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
+  static_assert(!std::is_const_v<decltype(t)::element_type>);
+  static_assert(!std::is_const_v<decltype(t)::value_type>);
+
+  auto n = std::distance(linalg::tcbegin(t), linalg::tcend(t));
+  ASSERT_EQ(n, t.Size());
+  ASSERT_FALSE(t.Empty());
+
+  bool eq = thrust::equal(cuctx->CTP(), data.cbegin(), data.cend(), linalg::tcbegin(t));
+  ASSERT_TRUE(eq);
+}
 }  // namespace xgboost::linalg
diff --git a/tests/cpp/common/test_parameter.cc b/tests/cpp/common/test_parameter.cc
index 5e8021a1e..5288366f8 100644
--- a/tests/cpp/common/test_parameter.cc
+++ b/tests/cpp/common/test_parameter.cc
@@ -97,4 +97,9 @@ TEST(XGBoostParameter, Update) {
     ASSERT_NEAR(p.f, 2.71828f, kRtEps);
     ASSERT_NEAR(p.d, 2.71828, kRtEps);  // default
   }
+
+  // Just in case dmlc's use of global memory has any impact on parameters.
+  UpdatableParam a, b;
+  a.UpdateAllowUnknown(xgboost::Args{{"f", "2.71828"}});
+  ASSERT_NE(a.f, b.f);
 }
diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc
index b3f3a67ca..cb631146b 100644
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -1,6 +1,9 @@
-// Copyright 2016-2021 by Contributors
+/**
+ * Copyright 2016-2024, XGBoost contributors
+ */
 #include "test_metainfo.h"
 
+#include <gmock/gmock.h>
 #include <dmlc/io.h>
 #include <xgboost/data.h>
 
@@ -9,7 +12,7 @@
 
 #include "../../../src/common/version.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
-#include "../helpers.h"
+#include "../helpers.h"     // for GMockThrow
 #include "xgboost/base.h"
 
 namespace xgboost {
@@ -46,6 +49,8 @@ TEST(MetaInfo, GetSet) {
 
 TEST(MetaInfo, GetSetFeature) {
   xgboost::MetaInfo info;
+  ASSERT_THAT([&] { info.SetFeatureInfo("", nullptr, 0); },
+              GMockThrow("Unknown feature info name"));
   EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error);
   EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error);
   EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0));
@@ -86,7 +91,8 @@ void VerifyGetSetFeatureColumnSplit() {
   std::transform(types.cbegin(), types.cend(), c_types.begin(),
                  [](auto const &str) { return str.c_str(); });
   info.num_col_ = kCols;
-  EXPECT_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), dmlc::Error);
+  ASSERT_THAT([&] { info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()); },
+              GMockThrow("Length of feature_type must be equal to number of columns"));
   info.num_col_ = kCols * world_size;
   EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()));
   std::vector<std::string> expected_type_names{u8"float", u8"c",     u8"float",
@@ -103,7 +109,8 @@ void VerifyGetSetFeatureColumnSplit() {
   std::transform(names.cbegin(), names.cend(), c_names.begin(),
                  [](auto const &str) { return str.c_str(); });
   info.num_col_ = kCols;
-  EXPECT_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()), dmlc::Error);
+  ASSERT_THAT([&] { info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()); },
+              GMockThrow("Length of feature_name must be equal to number of columns"));
   info.num_col_ = kCols * world_size;
   EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()));
   std::vector<std::string> expected_names{u8"0.feature0", u8"0.feature1", u8"1.feature0",
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 744793ba0..37bed3944 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -1,8 +1,9 @@
 /**
- * Copyright 2016-2024 by XGBoost contributors
+ * Copyright 2016-2024, XGBoost contributors
  */
 #pragma once
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -12,7 +13,7 @@
 #include <xgboost/learner.h>  // for LearnerModelParam
 #include <xgboost/model.h>    // for Configurable
 
-#include <cstdint>            // std::int32_t
+#include <cstdint>  // std::int32_t
 #include <cstdio>
 #include <fstream>
 #include <iostream>
@@ -573,30 +574,7 @@ class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
 
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 
-/**
- * @brief poor man's gmock for message matching.
- *
- * @tparam Error The type of expected execption.
- *
- * @param submsg A substring of the actual error message.
- * @param fn The function that throws Error
- */
-template <typename Error, typename Fn>
-void ExpectThrow(std::string submsg, Fn&& fn) {
-  try {
-    fn();
-  } catch (Error const& exc) {
-    auto actual = std::string{exc.what()};
-    ASSERT_NE(actual.find(submsg), std::string::npos)
-        << "Expecting substring `" << submsg << "` from the error message."
-        << " Got:\n"
-        << actual << "\n";
-    return;
-  } catch (std::exception const& exc) {
-    auto actual = exc.what();
-    ASSERT_TRUE(false) << "An unexpected type of exception is thrown. what:" << actual;
-    return;
-  }
-  ASSERT_TRUE(false) << "No exception is thrown";
+inline auto GMockThrow(StringView msg) {
+  return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }
 }  // namespace xgboost
diff --git a/tests/cpp/plugin/federated/test_federated_comm.cc b/tests/cpp/plugin/federated/test_federated_comm.cc
index 0d0692b5f..16edc685f 100644
--- a/tests/cpp/plugin/federated/test_federated_comm.cc
+++ b/tests/cpp/plugin/federated/test_federated_comm.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023, XGBoost contributors
+ * Copyright 2022-2024, XGBoost contributors
  */
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -9,7 +9,7 @@
 
 #include "../../../../plugin/federated/federated_comm.h"
 #include "../../collective/test_worker.h"  // for SocketTest
-#include "../../helpers.h"                 // for ExpectThrow
+#include "../../helpers.h"                 // for GMockThrow
 #include "test_worker.h"                   // for TestFederated
 #include "xgboost/json.h"                  // for Json
 
@@ -20,19 +20,19 @@ class FederatedCommTest : public SocketTest {};
 
 TEST_F(FederatedCommTest, ThrowOnWorldSizeTooSmall) {
   auto construct = [] { FederatedComm comm{"localhost", 0, 0, 0}; };
-  ASSERT_THAT(construct,
-              ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr("Invalid world size")));
+  ASSERT_THAT(construct, GMockThrow("Invalid world size"));
 }
 
 TEST_F(FederatedCommTest, ThrowOnRankTooSmall) {
   auto construct = [] { FederatedComm comm{"localhost", 0, 1, -1}; };
-  ASSERT_THAT(construct,
-              ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr("Invalid worker rank.")));
+  ASSERT_THAT(construct, GMockThrow("Invalid worker rank."));
 }
 
 TEST_F(FederatedCommTest, ThrowOnRankTooBig) {
-  auto construct = [] { FederatedComm comm{"localhost", 0, 1, 1}; };
-  ExpectThrow<dmlc::Error>("Invalid worker rank.", construct);
+  auto construct = [] {
+    FederatedComm comm{"localhost", 0, 1, 1};
+  };
+  ASSERT_THAT(construct, GMockThrow("Invalid worker rank."));
 }
 
 TEST_F(FederatedCommTest, ThrowOnWorldSizeNotInteger) {
@@ -43,7 +43,7 @@ TEST_F(FederatedCommTest, ThrowOnWorldSizeNotInteger) {
     config["federated_rank"] = Integer(0);
     FederatedComm comm{DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config};
   };
-  ExpectThrow<dmlc::Error>("got: `String`", construct);
+  ASSERT_THAT(construct, GMockThrow("got: `String`"));
 }
 
 TEST_F(FederatedCommTest, ThrowOnRankNotInteger) {
@@ -54,7 +54,7 @@ TEST_F(FederatedCommTest, ThrowOnRankNotInteger) {
     config["federated_rank"] = std::string("0");
     FederatedComm comm(DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config);
   };
-  ExpectThrow<dmlc::Error>("got: `String`", construct);
+  ASSERT_THAT(construct, GMockThrow("got: `String`"));
 }
 
 TEST_F(FederatedCommTest, GetWorldSizeAndRank) {
diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h
new file mode 100644
index 000000000..c5cdd3ea5
--- /dev/null
+++ b/tests/cpp/plugin/sycl_helpers.h
@@ -0,0 +1,30 @@
+/*!
+ * Copyright 2022-2024 XGBoost contributors
+ */
+#pragma once
+
+#include "../helpers.h"
+
+namespace xgboost::sycl {
+template<typename T, typename Container>
+void VerifySyclVector(const USMVector<T, MemoryType::shared>& sycl_vector,
+                      const Container& host_vector) {
+  ASSERT_EQ(sycl_vector.Size(), host_vector.size());
+
+  size_t size = sycl_vector.Size();
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(sycl_vector[i], host_vector[i]);
+  }
+}
+
+template<typename T, typename Container>
+void VerifySyclVector(const std::vector<T>& sycl_vector, const Container& host_vector) {
+  ASSERT_EQ(sycl_vector.size(), host_vector.size());
+
+  size_t size = sycl_vector.size();
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(sycl_vector[i], host_vector[i]);
+  }
+}
+
+}  // namespace xgboost::sycl
diff --git a/tests/cpp/plugin/test_sycl_gradient_index.cc b/tests/cpp/plugin/test_sycl_gradient_index.cc
new file mode 100644
index 000000000..4d605ce7a
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_gradient_index.cc
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2021-2024 by XGBoost contributors
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include "../../../src/data/gradient_index.h"       // for GHistIndexMatrix
+#pragma GCC diagnostic pop
+
+#include "../../../plugin/sycl/data/gradient_index.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "sycl_helpers.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::data {
+
+TEST(SyclGradientIndex, HistogramCuts) {
+  size_t max_bins = 8;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+
+  auto p_fmat = RandomDataGenerator{512, 16, 0.5}.GenerateDMatrix(true);
+
+  xgboost::common::HistogramCuts cut = 
+    xgboost::common::SketchOnDMatrix(&ctx, p_fmat.get(), max_bins);
+
+  common::HistogramCuts cut_sycl;
+  cut_sycl.Init(qu, cut);
+
+  VerifySyclVector(cut_sycl.Ptrs(), cut.cut_ptrs_.HostVector());
+  VerifySyclVector(cut_sycl.Values(), cut.cut_values_.HostVector());
+  VerifySyclVector(cut_sycl.MinValues(), cut.min_vals_.HostVector());
+}
+
+TEST(SyclGradientIndex, Init) {
+  size_t n_rows = 128;
+  size_t n_columns = 7;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+
+  auto p_fmat = RandomDataGenerator{n_rows, n_columns, 0.3}.GenerateDMatrix();
+
+  sycl::DeviceMatrix dmat;
+  dmat.Init(qu, p_fmat.get());
+
+  int max_bins = 256;
+  common::GHistIndexMatrix gmat_sycl;
+  gmat_sycl.Init(qu, &ctx, dmat, max_bins);
+
+  xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), max_bins, 0.3, false};
+
+  {
+    ASSERT_EQ(gmat_sycl.max_num_bins, max_bins);
+    ASSERT_EQ(gmat_sycl.nfeatures, n_columns);
+  }
+
+  {
+    VerifySyclVector(gmat_sycl.hit_count, gmat.hit_count);
+  }
+
+  {
+    std::vector<size_t> feature_count_sycl(n_columns, 0);
+    gmat_sycl.GetFeatureCounts(feature_count_sycl.data());
+
+    std::vector<size_t> feature_count(n_columns, 0);
+    gmat.GetFeatureCounts(feature_count.data());
+    VerifySyclVector(feature_count_sycl, feature_count);
+  }
+}
+
+}  // namespace xgboost::sycl::data
diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc
index 90bc757eb..7e3126a79 100644
--- a/tests/cpp/plugin/test_sycl_partition_builder.cc
+++ b/tests/cpp/plugin/test_sycl_partition_builder.cc
@@ -13,6 +13,108 @@
 
 namespace xgboost::sycl::common {
 
+void TestPartitioning(float sparsity, int max_bins) {
+  const size_t num_rows = 16;
+  const size_t num_columns = 1;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+
+  auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix();
+  sycl::DeviceMatrix dmat;
+  dmat.Init(qu, p_fmat.get());
+
+  common::GHistIndexMatrix gmat;
+  gmat.Init(qu, &ctx, dmat, max_bins);
+
+  RowSetCollection row_set_collection;
+  auto& row_indices = row_set_collection.Data();
+  row_indices.Resize(&qu, num_rows);
+  size_t* p_row_indices = row_indices.Data();
+
+  qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows),
+                       [p_row_indices](::sycl::item<1> pid) {
+      const size_t idx = pid.get_id(0);
+      p_row_indices[idx] = idx;
+    });
+  }).wait_and_throw();
+  row_set_collection.Init();
+
+  RegTree tree;
+  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
+
+  const size_t n_nodes = row_set_collection.Size();
+  PartitionBuilder partition_builder;
+  partition_builder.Init(&qu, n_nodes, [&](size_t nid) {
+    return row_set_collection[nid].Size();
+  });
+
+  std::vector<tree::ExpandEntry> nodes;
+  nodes.emplace_back(tree::ExpandEntry(0, tree.GetDepth(0)));
+
+  ::sycl::event event;
+  std::vector<int32_t> split_conditions = {2};
+  partition_builder.Partition(gmat, nodes, row_set_collection,
+                    split_conditions, &tree, &event);
+  qu.wait_and_throw();
+
+  size_t* data_result = const_cast<size_t*>(row_set_collection[0].begin);
+  partition_builder.MergeToArray(0, data_result, &event);
+  qu.wait_and_throw();
+
+  bst_float split_pt = gmat.cut.Values()[split_conditions[0]];
+
+  std::vector<uint8_t> ridx_left(num_rows, 0);
+  std::vector<uint8_t> ridx_right(num_rows, 0);
+  for (auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
+    const auto& data_vec = batch.data.HostVector();
+    const auto& offset_vec = batch.offset.HostVector();
+
+    size_t begin = offset_vec[0];
+    for (size_t idx = 0; idx < offset_vec.size() - 1; ++idx) {
+      size_t end = offset_vec[idx + 1];
+      if (begin < end) {
+        const auto& entry = data_vec[begin];
+        if (entry.fvalue < split_pt) {
+          ridx_left[idx] = 1;
+        } else {
+          ridx_right[idx] = 1;
+        }
+      } else {
+        // missing value
+        if (tree[0].DefaultLeft()) {
+          ridx_left[idx] = 1;
+        } else {
+          ridx_right[idx] = 1;
+        }
+      }
+      begin = end;
+    }
+  }
+  auto n_left  = std::accumulate(ridx_left.begin(),  ridx_left.end(),  0);
+  auto n_right = std::accumulate(ridx_right.begin(), ridx_right.end(), 0);
+
+  std::vector<size_t> row_indices_host(num_rows);
+  qu.memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t));
+  qu.wait_and_throw();
+
+  ASSERT_EQ(n_left,  partition_builder.GetNLeftElems(0));
+  for (size_t i = 0; i < n_left; ++i) {
+    auto idx = row_indices_host[i];
+    ASSERT_EQ(ridx_left[idx], 1);
+  }
+
+  ASSERT_EQ(n_right, partition_builder.GetNRightElems(0));
+  for (size_t i = 0; i < n_right; ++i) {
+    auto idx = row_indices_host[num_rows - 1 - i];
+    ASSERT_EQ(ridx_right[idx], 1);
+  }
+}
+
 TEST(SyclPartitionBuilder, BasicTest) {
   constexpr size_t kNodes = 5;
   // Number of rows for each node
@@ -67,7 +169,7 @@ TEST(SyclPartitionBuilder, BasicTest) {
   std::vector<size_t> v(*std::max_element(rows.begin(), rows.end()));
   size_t row_id = 0;
   for(size_t nid = 0; nid < kNodes; ++nid) {
-    builder.MergeToArray(nid, v.data(), event);
+    builder.MergeToArray(nid, v.data(), &event);
     qu.wait();
 
     // Check that row_id for left side are correct
@@ -88,4 +190,20 @@ TEST(SyclPartitionBuilder, BasicTest) {
   }
 }
 
+TEST(SyclPartitionBuilder, PartitioningSparce) {
+  TestPartitioning(0.3, 256);
+}
+
+TEST(SyclPartitionBuilder, PartitioningDence8Bits) {
+  TestPartitioning(0.0, 256);
+}
+
+TEST(SyclPartitionBuilder, PartitioningDence16Bits) {
+  TestPartitioning(0.0, 256 + 1);
+}
+
+TEST(SyclPartitionBuilder, PartitioningDence32Bits) {
+  TestPartitioning(0.0, (1u << 16) + 1);
+}
+
 }  // namespace xgboost::common
diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc
new file mode 100644
index 000000000..f527d9f16
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2020-2023 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "../../../plugin/sycl/common/row_set.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::common {
+TEST(SyclRowSetCollection, AddSplits) {
+  const size_t num_rows = 16;
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
+
+  RowSetCollection row_set_collection;
+
+  auto& row_indices = row_set_collection.Data();
+  row_indices.Resize(&qu, num_rows);
+  size_t* p_row_indices = row_indices.Data();
+
+  qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows),
+                       [p_row_indices](::sycl::item<1> pid) {
+      const size_t idx = pid.get_id(0);
+      p_row_indices[idx] = idx;
+    });
+  }).wait_and_throw();
+  row_set_collection.Init();
+
+  CHECK_EQ(row_set_collection.Size(), 1);
+  {
+    size_t nid_test = 0;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin());
+    CHECK_EQ(elem.end, row_indices.End());
+    CHECK_EQ(elem.node_id , 0);
+  }
+
+  size_t nid = 0;
+  size_t nid_left = 1;
+  size_t nid_right = 2;
+  size_t n_left = 4;
+  size_t n_right = num_rows - n_left;
+  row_set_collection.AddSplit(nid, nid_left, nid_right, n_left, n_right);
+  CHECK_EQ(row_set_collection.Size(), 3);
+
+  {
+    size_t nid_test = 0;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, nullptr);
+    CHECK_EQ(elem.end, nullptr);
+    CHECK_EQ(elem.node_id , -1);
+  }
+
+  {
+    size_t nid_test = 1;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin());
+    CHECK_EQ(elem.end, row_indices.Begin() + n_left);
+    CHECK_EQ(elem.node_id , nid_test);
+  }
+
+  {
+    size_t nid_test = 2;
+    auto& elem = row_set_collection[nid_test];
+    CHECK_EQ(elem.begin, row_indices.Begin() + n_left);
+    CHECK_EQ(elem.end, row_indices.End());
+    CHECK_EQ(elem.node_id , nid_test);
+  }
+
+}
+}  // namespace xgboost::sycl::common
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 8f3955c05..669827ee4 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -148,7 +148,7 @@ TEST(CPUPredictor, GHistIndexTraining) {
   auto adapter = data::ArrayAdapter(columnar.c_str());
   std::shared_ptr<DMatrix> p_full{
       DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
-  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist, true);
 }
 
 TEST(CPUPredictor, CategoricalPrediction) {
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 3420de78e..0d1c64186 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -118,7 +118,8 @@ TEST(Predictor, PredictionCache) {
 }
 
 void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
-                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist) {
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist,
+                            bool check_contribs) {
   size_t constexpr kCols = 16;
   size_t constexpr kClasses = 3;
   size_t constexpr kIters = 3;
@@ -161,6 +162,28 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
   for (size_t i = 0; i < rows; ++i) {
     EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps);
   }
+
+  if (check_contribs) {
+    // Contributions
+    HostDeviceVector<float> from_full_contribs;
+    learner->Predict(p_full, false, &from_full_contribs, 0, 0, false, false, true);
+    HostDeviceVector<float> from_hist_contribs;
+    learner->Predict(p_hist, false, &from_hist_contribs, 0, 0, false, false, true);
+    for (size_t i = 0; i < from_full_contribs.ConstHostVector().size(); ++i) {
+      EXPECT_NEAR(from_hist_contribs.ConstHostVector()[i],
+                  from_full_contribs.ConstHostVector()[i], kRtEps);
+    }
+
+    // Contributions (approximate method)
+    HostDeviceVector<float> from_full_approx_contribs;
+    learner->Predict(p_full, false, &from_full_approx_contribs, 0, 0, false, false, false, true);
+    HostDeviceVector<float> from_hist_approx_contribs;
+    learner->Predict(p_hist, false, &from_hist_approx_contribs, 0, 0, false, false, false, true);
+    for (size_t i = 0; i < from_full_approx_contribs.ConstHostVector().size(); ++i) {
+      EXPECT_NEAR(from_hist_approx_contribs.ConstHostVector()[i],
+                  from_full_approx_contribs.ConstHostVector()[i], kRtEps);
+    }
+  }
 }
 
 void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index c2b28883a..a65b60579 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -89,7 +89,8 @@ void TestBasic(DMatrix* dmat, Context const * ctx);
 
 // p_full and p_hist should come from the same data set.
 void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
-                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist,
+                            bool check_contribs = false);
 
 void TestInplacePrediction(Context const* ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
                            bst_feature_t cols);
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 8dccea10b..6350bae92 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2017-2023, XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
@@ -82,9 +82,7 @@ TEST(Learner, ParameterValidation) {
 
   // whitespace
   learner->SetParam("tree method", "exact");
-  EXPECT_THAT([&] { learner->Configure(); },
-              ::testing::ThrowsMessage<dmlc::Error>(
-                  ::testing::HasSubstr(R"("tree method" contains whitespace)")));
+  ASSERT_THAT([&] { learner->Configure(); }, GMockThrow(R"("tree method" contains whitespace)"));
 }
 
 TEST(Learner, CheckGroup) {
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index bf23991c1..eacfcc58f 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -60,7 +60,7 @@ void CompareJSON(Json l, Json r) {
     }
     break;
   }
-  case Value::ValueKind::kNumberArray: {
+  case Value::ValueKind::kF32Array: {
     auto const& l_arr = get<F32Array const>(l);
     auto const& r_arr = get<F32Array const>(r);
     ASSERT_EQ(l_arr.size(), r_arr.size());
@@ -69,6 +69,15 @@ void CompareJSON(Json l, Json r) {
     }
     break;
   }
+  case Value::ValueKind::kF64Array: {
+    auto const& l_arr = get<F64Array const>(l);
+    auto const& r_arr = get<F64Array const>(r);
+    ASSERT_EQ(l_arr.size(), r_arr.size());
+    for (size_t i = 0; i < l_arr.size(); ++i) {
+      ASSERT_NEAR(l_arr[i], r_arr[i], kRtEps);
+    }
+    break;
+  }
   case Value::ValueKind::kU8Array: {
     CompareIntArray<U8Array>(l, r);
     break;
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc
index 550b8837c..0b5745a20 100644
--- a/tests/cpp/tree/test_multi_target_tree_model.cc
+++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 by XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>     // for Context
@@ -7,16 +7,23 @@
 #include <xgboost/tree_model.h>  // for RegTree
 
 namespace xgboost {
-TEST(MultiTargetTree, JsonIO) {
+namespace {
+auto MakeTreeForTest() {
   bst_target_t n_targets{3};
   bst_feature_t n_features{4};
   RegTree tree{n_targets, n_features};
-  ASSERT_TRUE(tree.IsMultiTarget());
+  CHECK(tree.IsMultiTarget());
   linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()};
   linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()};
   linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()};
   tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                   left_weight.HostView(), right_weight.HostView());
+  return tree;
+}
+}  // namespace
+
+TEST(MultiTargetTree, JsonIO) {
+  auto tree = MakeTreeForTest();
   ASSERT_EQ(tree.NumNodes(), 3);
   ASSERT_EQ(tree.NumTargets(), 3);
   ASSERT_EQ(tree.GetMultiTargetTree()->Size(), 3);
@@ -44,4 +51,28 @@ TEST(MultiTargetTree, JsonIO) {
   loaded.SaveModel(&jtree1);
   check_jtree(jtree1, tree);
 }
+
+TEST(MultiTargetTree, DumpDot) {
+  auto tree = MakeTreeForTest();
+  auto n_features = tree.NumFeatures();
+  FeatureMap fmap;
+  for (bst_feature_t f = 0; f < n_features; ++f) {
+    auto name = "feat_" + std::to_string(f);
+    fmap.PushBack(f, name.c_str(), "q");
+  }
+  auto str = tree.DumpModel(fmap, true, "dot");
+  ASSERT_NE(str.find("leaf=[2, 3, 4]"), std::string::npos);
+  ASSERT_NE(str.find("leaf=[3, 4, 5]"), std::string::npos);
+
+  {
+    bst_target_t n_targets{4};
+    bst_feature_t n_features{4};
+    RegTree tree{n_targets, n_features};
+    linalg::Vector<float> weight{{1.0f, 2.0f, 3.0f, 4.0f}, {4ul}, DeviceOrd::CPU()};
+    tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(),
+                    weight.HostView(), weight.HostView());
+    auto str = tree.DumpModel(fmap, true, "dot");
+    ASSERT_NE(str.find("leaf=[1, 2, ..., 4]"), std::string::npos);
+  }
+}
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index cf806536a..4021c9959 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 by XGBoost Contributors
+ * Copyright 2018-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
@@ -18,7 +18,6 @@
 #include "xgboost/data.h"
 
 namespace xgboost::tree {
-
 namespace {
 template <typename ExpandEntry>
 void TestPartitioner(bst_target_t n_targets) {
@@ -253,5 +252,5 @@ void TestColumnSplit(bst_target_t n_targets) {
 
 TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
 
-TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); }
+TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
 }  // namespace xgboost::tree
diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py
index 79814a1bb..85d54c78d 100644
--- a/tests/python-gpu/test_from_cupy.py
+++ b/tests/python-gpu/test_from_cupy.py
@@ -202,7 +202,10 @@ class TestFromCupy:
         n = 100
         X = cp.random.random((n, 2))
         m = xgb.QuantileDMatrix(X.toDlpack())
-        with pytest.raises(xgb.core.XGBoostError):
+
+        with pytest.raises(
+            xgb.core.XGBoostError, match="Slicing DMatrix is not supported"
+        ):
             m.slice(rindex=[0, 1, 2])
 
     @pytest.mark.skipif(**tm.no_cupy())
diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py
index 2579b17de..b7c5c3adb 100644
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -6,6 +6,7 @@ import pytest
 
 import xgboost
 from xgboost import testing as tm
+from xgboost.testing.ranking import run_normalization
 
 pytestmark = tm.timeout(30)
 
@@ -126,3 +127,7 @@ def test_with_mq2008(objective, metric) -> None:
     dtest = xgboost.DMatrix(x_test, y_test, qid=qid_test)
 
     comp_training_with_rank_objective(dtrain, dtest, objective, metric)
+
+
+def test_normalization() -> None:
+    run_normalization("cuda")
diff --git a/tests/python/test_ranking.py b/tests/python/test_ranking.py
index 8bdeb070f..49508f594 100644
--- a/tests/python/test_ranking.py
+++ b/tests/python/test_ranking.py
@@ -13,6 +13,7 @@ import xgboost
 from xgboost import testing as tm
 from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples
 from xgboost.testing.params import lambdarank_parameter_strategy
+from xgboost.testing.ranking import run_normalization
 
 
 def test_ndcg_custom_gain():
@@ -53,6 +54,20 @@ def test_ndcg_custom_gain():
     assert byxgb.evals_result() == bynp.evals_result()
     assert byxgb_json == bynp_json
 
+    # test pairwise can handle max_rel > 31, while ndcg metric is using custom gain
+    X, y, q, w = tm.make_ltr(n_samples=1024, n_features=4, n_query_groups=3, max_rel=33)
+    ranknet = xgboost.XGBRanker(
+        tree_method="hist",
+        ndcg_exp_gain=False,
+        n_estimators=10,
+        objective="rank:pairwise",
+    )
+    ranknet.fit(X, y, qid=q, eval_set=[(X, y)], eval_qid=[q])
+    history = ranknet.evals_result()
+    assert (
+        history["validation_0"]["ndcg@32"][0] < history["validation_0"]["ndcg@32"][-1]
+    )
+
 
 def test_ranking_with_unweighted_data():
     Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
@@ -188,6 +203,10 @@ def test_unbiased() -> None:
     assert df["ti+"].iloc[-1] < df["ti+"].iloc[0]
 
 
+def test_normalization() -> None:
+    run_normalization("cpu")  # imported helper; "cpu" is presumably the device
+
+
 class TestRanking:
     @classmethod
     def setup_class(cls):
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 88149c054..097298f07 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -2,7 +2,6 @@ import itertools
 import re
 
 import numpy as np
-import scipy
 import scipy.special
 
 import xgboost as xgb
@@ -256,3 +255,30 @@ class TestSHAP:
         brute_force[-1, -1] += base_score
         fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True)
         assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4
+
+    def test_shap_values(self) -> None:
+        """SHAP output must match between DMatrix and QuantileDMatrix inputs."""
+        from sklearn.datasets import make_classification, make_regression
+
+        def assert_same(X: np.ndarray, y: np.ndarray) -> None:
+            dm = xgb.DMatrix(X, y)
+            qdm = xgb.QuantileDMatrix(X, y)
+            booster = xgb.train({}, dm, num_boost_round=4)
+
+            shap_dm = booster.predict(dm, pred_contribs=True)
+            shap_qdm = booster.predict(qdm, pred_contribs=True)
+            np.testing.assert_allclose(shap_dm, shap_qdm)
+
+            # Summing contributions over the last axis must reproduce the margin.
+            margin = booster.predict(qdm, output_margin=True)
+            np.testing.assert_allclose(np.sum(shap_qdm, axis=-1), margin, 1e-3, 1e-3)
+
+            # The reference comes from the DMatrix; predicting both sides on the
+            # QuantileDMatrix would make the comparison vacuous.
+            shap_dm = booster.predict(dm, pred_interactions=True)
+            shap_qdm = booster.predict(qdm, pred_interactions=True)
+            np.testing.assert_allclose(shap_dm, shap_qdm)
+
+        for make_data in (make_regression, make_classification):
+            X, y = make_data()
+            assert_same(X, y)
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index e7641348d..8ec1fdd9d 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -35,10 +35,24 @@ class TestTreeMethod:
     def test_exact(self, param, num_rounds, dataset):
         if dataset.name.endswith("-l1"):
             return
-        param['tree_method'] = 'exact'
+        param["tree_method"] = "exact"
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+    def test_exact_sample_by_node_error(self) -> None:
+        """``exact`` must raise on colsample_bynode=0.999 but train with 1.0."""
+        X, y, w = tm.make_regression(128, 12, False)
+        with pytest.raises(ValueError, match="column sample by node"):
+            xgb.train(
+                {"tree_method": "exact", "colsample_bynode": 0.999},
+                xgb.DMatrix(X, y, weight=w),
+            )
+        xgb.train(
+            {"tree_method": "exact", "colsample_bynode": 1.0},
+            xgb.DMatrix(X, y, weight=w),
+            num_boost_round=2,
+        )
 
     @given(
         exact_parameter_strategy,
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 344628e4f..507470724 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -517,6 +517,12 @@ def test_regression_with_custom_objective():
         labels = y[test_index]
     assert mean_squared_error(preds, labels) < 25
 
+    w = rng.uniform(low=0.0, high=1.0, size=X.shape[0])
+    reg = xgb.XGBRegressor(objective=tm.ls_obj, n_estimators=25)
+    reg.fit(X, y, sample_weight=w)
+    y_pred = reg.predict(X)
+    assert mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=w) < 25
+
     # Test that the custom objective function is actually used
     class XGBCustomObjectiveException(Exception):
         pass
@@ -1456,3 +1462,16 @@ def test_intercept() -> None:
     result = reg.intercept_
     assert result.dtype == np.float32
     assert result[0] < 0.5
+
+
+def test_fit_none() -> None:
+    """``fit`` must raise when either the data or the labels are ``None``."""
+    with pytest.raises(TypeError, match="NoneType"):
+        xgb.XGBClassifier().fit(None, [0, 1])
+
+    X = rng.normal(size=4).reshape(2, 2)
+    with pytest.raises(ValueError, match="Invalid classes"):
+        xgb.XGBClassifier().fit(X, None)
+
+    with pytest.raises(ValueError, match="labels"):
+        xgb.XGBRegressor().fit(X, None)
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index fdf0d64c4..ffea1d058 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -1750,9 +1750,20 @@ class TestWithDask:
             )
             tm.non_increasing(results_native["validation_0"]["rmse"])
 
+            reg = xgb.dask.DaskXGBRegressor(
+                n_estimators=rounds, objective=tm.ls_obj, tree_method="hist"
+            )
+            rng = da.random.RandomState(1994)
+            w = rng.uniform(low=0.0, high=1.0, size=y.shape[0])
+            reg.fit(
+                X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+            )
+            results_custom = reg.evals_result()
+            tm.non_increasing(results_custom["validation_0"]["rmse"])
+
     def test_no_duplicated_partition(self) -> None:
-        """Assert each worker has the correct amount of data, and DMatrix initialization doesn't
-        generate unnecessary copies of data.
+        """Assert each worker has the correct amount of data, and DMatrix initialization
+        doesn't generate unnecessary copies of data.
 
         """
         with LocalCluster(n_workers=2, dashboard_address=":0") as cluster: