diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index 917245ec6..d004ab15c 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -54,7 +54,6 @@ jobs:
       matrix:
         config:
           - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
-          - {os: windows-latest, r: '4.3.0', compiler: 'msvc', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
       RSPM: ${{ matrix.config.rspm }}
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 7c01d50c6..bbaf3e75d 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -65,6 +65,6 @@ Imports:
     data.table (>= 1.9.6),
     jsonlite (>= 1.0)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
 Encoding: UTF-8
 SystemRequirements: GNU make, C++17
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index a29c9b1e0..398b0da5a 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method("[",xgb.Booster)
 S3method("[",xgb.DMatrix)
 S3method("dimnames<-",xgb.DMatrix)
 S3method(coef,xgb.Booster)
@@ -7,6 +8,7 @@ S3method(dim,xgb.DMatrix)
 S3method(dimnames,xgb.DMatrix)
 S3method(getinfo,xgb.Booster)
 S3method(getinfo,xgb.DMatrix)
+S3method(length,xgb.Booster)
 S3method(predict,xgb.Booster)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
@@ -62,6 +64,7 @@ export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
 export(xgb.set.config)
+export(xgb.slice.Booster)
 export(xgb.train)
 export(xgboost)
 import(methods)
diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index b3d6bdb1a..02e0a7cd4 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -280,7 +280,6 @@ cb.reset.parameters <- function(new_params) {
 #' \code{iteration},
 #' \code{begin_iteration},
 #' \code{end_iteration},
-#' \code{num_parallel_tree}.
 #'
 #' @seealso
 #' \code{\link{callbacks}},
@@ -291,7 +290,6 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
                           metric_name = NULL, verbose = TRUE) {
   # state variables
   best_iteration <- -1
-  best_ntreelimit <- -1
   best_score <- Inf
   best_msg <- NULL
   metric_idx <- 1
@@ -358,12 +356,10 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
         # If the difference is due to floating-point truncation, update best_score
         best_score <- attr_best_score
       }
-      xgb.attr(env$bst, "best_iteration") <- best_iteration
-      xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
+      xgb.attr(env$bst, "best_iteration") <- best_iteration - 1
       xgb.attr(env$bst, "best_score") <- best_score
     } else {
       env$basket$best_iteration <- best_iteration
-      env$basket$best_ntreelimit <- best_ntreelimit
     }
   }
 
@@ -385,14 +381,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
       )
       best_score <<- score
       best_iteration <<- i
-      best_ntreelimit <<- best_iteration * env$num_parallel_tree
       # save the property to attributes, so they will occur in checkpoint
       if (!is.null(env$bst)) {
         xgb.attributes(env$bst) <- list(
           best_iteration = best_iteration - 1, # convert to 0-based index
           best_score = best_score,
-          best_msg = best_msg,
-          best_ntreelimit = best_ntreelimit)
+          best_msg = best_msg
+        )
       }
     } else if (i - best_iteration >= stopping_rounds) {
       env$stop_condition <- TRUE
@@ -475,8 +470,6 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
 #' \code{data},
 #' \code{end_iteration},
 #' \code{params},
-#' \code{num_parallel_tree},
-#' \code{num_class}.
 #'
 #' @return
 #' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
@@ -499,19 +492,21 @@ cb.cv.predict <- function(save_models = FALSE) {
       stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
 
     N <- nrow(env$data)
-    pred <-
-      if (env$num_class > 1) {
-        matrix(NA_real_, N, env$num_class)
-      } else {
-        rep(NA_real_, N)
-      }
+    pred <- NULL
 
-    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
+    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration))
     if (NVL(env$params[['booster']], '') == 'gblinear') {
-      iterationrange <- c(1, 1)  # must be 0 for gblinear
+      iterationrange <- "all"
     }
     for (fd in env$bst_folds) {
       pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
+      if (is.null(pred)) {
+        if (NCOL(pr) > 1L) {
+          pred <- matrix(NA_real_, N, ncol(pr))
+        } else {
+          pred <- matrix(NA_real_, N)
+        }
+      }
       if (is.matrix(pred)) {
         pred[fd$index, ] <- pr
       } else {
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 945d86132..e8ae787fc 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -208,7 +208,7 @@ xgb.iter.eval <- function(bst, watchlist, iter, feval) {
     res <- sapply(seq_along(watchlist), function(j) {
       w <- watchlist[[j]]
       ## predict using all trees
-      preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
+      preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all")
       eval_res <- feval(preds, w)
       out <- eval_res$value
       names(out) <- paste0(evnames[j], "-", eval_res$metric)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index cee7e9fc5..7613c9152 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -89,7 +89,6 @@ xgb.get.handle <- function(object) {
 #' @param outputmargin Whether the prediction should be returned in the form of original untransformed
 #'        sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for
 #'        logistic regression would return log-odds instead of probabilities.
-#' @param ntreelimit Deprecated, use `iterationrange` instead.
 #' @param predleaf Whether to predict pre-tree leaf indices.
 #' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
@@ -99,11 +98,17 @@ xgb.get.handle <- function(object) {
 #'        or `predinteraction` is `TRUE`.
 #' @param training Whether the predictions are used for training. For dart booster,
 #'        training predicting will perform dropout.
-#' @param iterationrange Specifies which trees are used in prediction. For
-#'        example, take a random forest with 100 rounds.
-#'        With `iterationrange=c(1, 21)`, only the trees built during `[1, 21)` (half open set)
-#'        rounds are used in this prediction. The index is 1-based just like an R vector. When set
-#'        to `c(1, 1)`, XGBoost will use all trees.
+#' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
+#'        a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
+#'        base-1 indexing, and inclusive of both ends).
+#'
+#'        For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
+#'        predict using only the first one.
+#'
+#'        If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
+#'        of the iterations (rounds) otherwise.
+#'
+#'        If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
 #' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
 #'        type and shape of predictions are invariant to the model type.
 #' @param ... Not used.
@@ -189,7 +194,7 @@ xgb.get.handle <- function(object) {
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
+#' pred1 <- predict(bst, test$data, iterationrange = c(1, 1))
 #'
 #' # Predicting tree leafs:
 #' # the result is an nsamples X ntrees matrix
@@ -260,11 +265,11 @@ xgb.get.handle <- function(object) {
 #' all.equal(pred, pred_labels)
 #' # prediction from using only 5 iterations should result
 #' # in the same error as seen in iteration 5:
-#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
+#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 5))
 #' sum(pred5 != lb) / length(lb)
 #'
 #' @export
-predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
+predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
                                 reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   if (!inherits(newdata, "xgb.DMatrix")) {
@@ -275,25 +280,21 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
     )
   }
 
-  if (NVL(xgb.booster_type(object), '') == 'gblinear' || is.null(ntreelimit))
-    ntreelimit <- 0
 
-  if (ntreelimit != 0 && is.null(iterationrange)) {
-    ## only ntreelimit, initialize iteration range
-    iterationrange <- c(0, 0)
-  } else if (ntreelimit == 0 && !is.null(iterationrange)) {
-    ## only iteration range, handle 1-based indexing
-    iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
-  } else if (ntreelimit != 0 && !is.null(iterationrange)) {
-    ## both are specified, let libgxgboost throw an error
+  if (!is.null(iterationrange)) {
+    if (is.character(iterationrange)) {
+      stopifnot(iterationrange == "all")
+      iterationrange <- c(0, 0)
+    } else {
+      iterationrange[1] <- iterationrange[1] - 1 # base-0 indexing
+    }
   } else {
     ## no limit is supplied, use best
     best_iteration <- xgb.best_iteration(object)
     if (is.null(best_iteration)) {
       iterationrange <- c(0, 0)
     } else {
-      ## We don't need to + 1 as R is 1-based index.
-      iterationrange <- c(0, as.integer(best_iteration))
+      iterationrange <- c(0, as.integer(best_iteration) + 1L)
     }
   }
   ## Handle the 0 length values.
@@ -312,7 +313,6 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
     strict_shape = box(TRUE),
     iteration_begin = box(as.integer(iterationrange[1])),
     iteration_end = box(as.integer(iterationrange[2])),
-    ntree_limit = box(as.integer(ntreelimit)),
     type = box(as.integer(0))
   )
 
@@ -343,24 +343,24 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   )
   names(predts) <- c("shape", "results")
   shape <- predts$shape
-  ret <- predts$results
+  arr <- predts$results
 
-  n_ret <- length(ret)
+  n_ret <- length(arr)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
   }
 
-  arr <- array(data = ret, dim = rev(shape))
+  .Call(XGSetArrayDimInplace_R, arr, rev(shape))
 
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
   n_groups <- shape[2]
 
   ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
-    dimnames(arr) <- list(cnames, NULL, NULL)
+    .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, NULL, NULL))
   } else if (predinteraction) {
-    dimnames(arr) <- list(cnames, cnames, NULL, NULL)
+    .Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, cnames, NULL, NULL))
   }
   if (strict_shape) {
     return(arr) # strict shape is calculated by libxgboost uniformly.
@@ -368,43 +368,51 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
 
   if (predleaf) {
     ## Predict leaf
-    arr <- if (n_ret == n_row) {
-      matrix(arr, ncol = 1)
+    if (n_ret == n_row) {
+      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
     } else {
-      matrix(arr, nrow = n_row, byrow = TRUE)
+      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
     }
   } else if (predcontrib) {
     ## Predict contribution
     arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    arr <- if (n_ret == n_row) {
-      matrix(arr, ncol =  1, dimnames = list(NULL, cnames))
+    if (n_ret == n_row) {
+      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
+      .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
     } else if (n_groups != 1) {
       ## turns array into list of matrices
-      lapply(seq_len(n_groups), function(g) arr[g, , ])
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
     } else {
       ## remove the first axis (group)
-      dn <- dimnames(arr)
-      matrix(arr[1, , ], nrow = dim(arr)[2], ncol = dim(arr)[3], dimnames = c(dn[2], dn[3]))
+      newdim <- dim(arr)[2:3]
+      newdn <- dimnames(arr)[2:3]
+      arr <- arr[1, , ]
+      .Call(XGSetArrayDimInplace_R, arr, newdim)
+      .Call(XGSetArrayDimNamesInplace_R, arr, newdn)
     }
   } else if (predinteraction) {
     ## Predict interaction
     arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    arr <- if (n_ret == n_row) {
-      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    if (n_ret == n_row) {
+      .Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
+      .Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
     } else if (n_groups != 1) {
       ## turns array into list of matrices
-      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
     } else {
       ## remove the first axis (group)
       arr <- arr[1, , , , drop = FALSE]
-      array(arr, dim = dim(arr)[2:4], dimnames(arr)[2:4])
+      newdim <- dim(arr)[2:4]
+      newdn <- dimnames(arr)[2:4]
+      .Call(XGSetArrayDimInplace_R, arr, newdim)
+      .Call(XGSetArrayDimNamesInplace_R, arr, newdn)
     }
   } else {
     ## Normal prediction
-    arr <- if (reshape && n_groups != 1) {
-      matrix(arr, ncol = n_groups, byrow = TRUE)
+    if (reshape && n_groups != 1) {
+      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
     } else {
-      as.vector(ret)
+      .Call(XGSetArrayDimInplace_R, arr, NULL)
     }
   }
   return(arr)
@@ -492,7 +500,7 @@ xgb.attr <- function(object, name) {
     return(NULL)
   }
   if (!is.null(out)) {
-    if (name %in% c("best_iteration", "best_ntreelimit", "best_score")) {
+    if (name %in% c("best_iteration", "best_score")) {
       out <- as.numeric(out)
     }
   }
@@ -685,16 +693,94 @@ setinfo.xgb.Booster <- function(object, name, info) {
 }
 
 #' @title Get number of boosting in a fitted booster
-#' @param model A fitted `xgb.Booster` model.
+#' @param model,x A fitted `xgb.Booster` model.
 #' @return The number of rounds saved in the model, as an integer.
 #' @details Note that setting booster parameters related to training
 #' continuation / updates through \link{xgb.parameters<-} will reset the
 #' number of rounds to zero.
 #' @export
+#' @rdname xgb.get.num.boosted.rounds
 xgb.get.num.boosted.rounds <- function(model) {
   return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model)))
 }
 
+#' @rdname xgb.get.num.boosted.rounds
+#' @export
+length.xgb.Booster <- function(x) {
+  return(xgb.get.num.boosted.rounds(x))
+}
+
+#' @title Slice Booster by Rounds
+#' @description Creates a new booster including only a selected range of rounds / iterations
+#' from an existing booster, as given by the sequence `seq(start, end, step)`.
+#' @details Note that any R attributes that the booster might have, will not be copied into
+#' the resulting object.
+#' @param model,x A fitted `xgb.Booster` object, which is to be sliced by taking only a subset
+#' of its rounds / iterations.
+#' @param start Start of the slice (base-1 and inclusive, like R's \link{seq}).
+#' @param end End of the slice (base-1 and inclusive, like R's \link{seq}).
+#'
+#' Passing a value of zero here is equivalent to passing the full number of rounds in the
+#' booster object.
+#' @param step Step size of the slice. Passing '1' will take every round in the sequence defined by
+#' `(start, end)`, while passing '2' will take every second value, and so on.
+#' @return A sliced booster object containing only the requested rounds.
+#' @examples
+#' data(mtcars)
+#' y <- mtcars$mpg
+#' x <- as.matrix(mtcars[, -1])
+#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
+#' model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
+#' model_slice <- xgb.slice.Booster(model, 1, 3)
+#' # Prediction for first three rounds
+#' predict(model, x, predleaf = TRUE)[, 1:3]
+#'
+#' # The new model has only those rounds, so
+#' # a full prediction from it is equivalent
+#' predict(model_slice, x, predleaf = TRUE)
+#' @export
+#' @rdname xgb.slice.Booster
+xgb.slice.Booster <- function(model, start, end = xgb.get.num.boosted.rounds(model), step = 1L) {
+  # This makes the slice mimic the behavior of R's 'seq',
+  # which truncates on the end of the slice when the step
+  # doesn't reach it.
+  if (end > start && step > 1) {
+    d <- (end - start + 1) / step
+    if (d != floor(d)) {
+      end <- start + step * ceiling(d) - 1
+    }
+  }
+  return(
+    .Call(
+      XGBoosterSlice_R,
+      xgb.get.handle(model),
+      start - 1,
+      end,
+      step
+    )
+  )
+}
+
+#' @export
+#' @rdname xgb.slice.Booster
+#' @param i The indices - must be an increasing sequence as generated by e.g. `seq(...)`.
+`[.xgb.Booster` <- function(x, i) {
+  if (missing(i)) {
+    return(xgb.slice.Booster(x, 1, 0))
+  }
+  if (length(i) == 1) {
+    return(xgb.slice.Booster(x, i, i))
+  }
+  steps <- diff(i)
+  if (any(steps < 0)) {
+    stop("Can only slice booster with ascending sequences.")
+  }
+  if (length(unique(steps)) > 1) {
+    stop("Can only slice booster with fixed-step sequences.")
+  }
+  return(xgb.slice.Booster(x, i[1L], i[length(i)], steps[1L]))
+}
+
 #' @title Get Features Names from Booster
 #' @description Returns the feature / variable / column names from a fitted
 #' booster object, which are set automatically during the call to \link{xgb.train}
@@ -710,12 +796,6 @@ variable.names.xgb.Booster <- function(object, ...) {
   return(getinfo(object, "feature_name"))
 }
 
-xgb.ntree <- function(bst) {
-  config <- xgb.config(bst)
-  out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees)
-  return(out)
-}
-
 xgb.nthread <- function(bst) {
   config <- xgb.config(bst)
   out <- strtoi(config$learner$generic_param$nthread)
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index a960957ca..eb0495631 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -103,7 +103,6 @@
 #'         parameter or randomly generated.
 #'   \item \code{best_iteration} iteration number with the best evaluation metric value
 #'         (only available with early stopping).
-#'   \item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
 #'   \item \code{pred} CV prediction values available when \code{prediction} is set.
 #'         It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 #'   \item \code{models} a list of the CV folds' models. It is only available with the explicit
@@ -218,7 +217,6 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
 
   # extract parameters that can affect the relationship b/w #trees and #iterations
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
-  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
 
   # those are fixed for CV (no training continuation)
   begin_iteration <- 1
@@ -318,7 +316,7 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
       })
     }
 
-    for (n in c('niter', 'best_iteration', 'best_ntreelimit')) {
+    for (n in c('niter', 'best_iteration')) {
       if (is.null(x[[n]]))
         next
       cat(n, ': ', x[[n]], '\n', sep = '')
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 44f2eb9b3..547d9677b 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -113,19 +113,12 @@
 #' xgb.importance(model = mbst)
 #'
 #' @export
-xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
+xgb.importance <- function(model = NULL, feature_names = getinfo(model, "feature_name"), trees = NULL,
                            data = NULL, label = NULL, target = NULL) {
 
   if (!(is.null(data) && is.null(label) && is.null(target)))
     warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
 
-  if (is.null(feature_names)) {
-    model_feature_names <- xgb.feature_names(model)
-    if (NROW(model_feature_names)) {
-      feature_names <- model_feature_names
-    }
-  }
-
   if (!(is.null(feature_names) || is.character(feature_names)))
     stop("feature_names: Has to be a character vector")
 
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index df0e672a9..ff416b73e 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -2,11 +2,8 @@
 #'
 #' Parse a boosted tree model text dump into a `data.table` structure.
 #'
-#' @param feature_names Character vector of feature names. If the model already
-#'        contains feature names, those will be used when \code{feature_names=NULL} (default value).
-#'
-#'        Note that, if the model already contains feature names, it's \bold{not} possible to override them here.
-#' @param model Object of class `xgb.Booster`.
+#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
+#'        \link{setinfo}), they will be used in the output from this function.
 #' @param text Character vector previously generated by the function [xgb.dump()]
 #'        (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
 #' @param trees An integer vector of tree indices that should be used.
@@ -58,7 +55,7 @@
 #'
 #' # This bst model already has feature_names stored with it, so those would be used when
 #' # feature_names is not set:
-#' (dt <- xgb.model.dt.tree(model = bst))
+#' dt <- xgb.model.dt.tree(bst)
 #'
 #' # How to match feature names of splits that are following a current 'Yes' branch:
 #' merge(
@@ -69,7 +66,7 @@
 #' ]
 #'
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
+xgb.model.dt.tree <- function(model = NULL, text = NULL,
                               trees = NULL, use_int_id = FALSE, ...) {
   check.deprecation(...)
 
@@ -79,24 +76,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
          "  (or NULL if 'model' was provided).")
   }
 
-  model_feature_names <- NULL
-  if (inherits(model, "xgb.Booster")) {
-    model_feature_names <- xgb.feature_names(model)
-    if (NROW(model_feature_names) && !is.null(feature_names)) {
-      stop("'model' contains feature names. Cannot override them.")
-    }
-  }
-  if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names))
-    feature_names <- model_feature_names
-
-  if (!(is.null(feature_names) || is.character(feature_names))) {
-    stop("feature_names: must be a character vector")
-  }
-
   if (!(is.null(trees) || is.numeric(trees))) {
     stop("trees: must be a vector of integers.")
   }
 
+  feature_names <- NULL
+  if (inherits(model, "xgb.Booster")) {
+    feature_names <- xgb.feature_names(model)
+  }
+
   from_text <- TRUE
   if (is.null(text)) {
     text <- xgb.dump(model = model, with_stats = TRUE)
@@ -134,7 +122,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
   branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
                               "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
   text_has_feature_names <- FALSE
-  if (NROW(model_feature_names)) {
+  if (NROW(feature_names)) {
     branch_rx <- branch_rx_w_names
     text_has_feature_names <- TRUE
   } else {
@@ -148,9 +136,6 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
       }
     }
   }
-  if (text_has_feature_names && is.null(model) && !is.null(feature_names)) {
-    stop("'text' contains feature names. Cannot override them.")
-  }
   branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
   td[
     isLeaf == FALSE,
diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R
index 88616cfb7..e6d678ee7 100644
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -62,13 +62,13 @@
 #' }
 #'
 #' @export
-xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
+xgb.plot.multi.trees <- function(model, features_keep = 5, plot_width = NULL, plot_height = NULL,
                                  render = TRUE, ...) {
   if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
     stop("DiagrammeR is required for xgb.plot.multi.trees")
   }
   check.deprecation(...)
-  tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
+  tree.matrix <- xgb.model.dt.tree(model = model)
 
   # first number of the path represents the tree, then the following numbers are related to the path to follow
   # root init
diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R
index c75a42e84..5ed1e70f6 100644
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -2,9 +2,8 @@
 #'
 #' Read a tree model text dump and plot the model.
 #'
-#' @param feature_names Character vector used to overwrite the feature names
-#'        of the model. The default (`NULL`) uses the original feature names.
-#' @param model Object of class `xgb.Booster`.
+#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
+#'        \link{setinfo}), they will be used in the output from this function.
 #' @param trees An integer vector of tree indices that should be used.
 #'        The default (`NULL`) uses all trees.
 #'        Useful, e.g., in multiclass classification to get only
@@ -103,7 +102,7 @@
 #' }
 #'
 #' @export
-xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
+xgb.plot.tree <- function(model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
                           render = TRUE, show_node_id = FALSE, style = c("R", "xgboost"), ...) {
   check.deprecation(...)
   if (!inherits(model, "xgb.Booster")) {
@@ -120,17 +119,12 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
     if (NROW(trees) != 1L || !render || show_node_id) {
       stop("style='xgboost' is only supported for single, rendered tree, without node IDs.")
     }
-    if (!is.null(feature_names)) {
-      stop(
-        "style='xgboost' cannot override 'feature_names'. Will automatically take them from the model."
-      )
-    }
 
     txt <- xgb.dump(model, dump_format = "dot")
     return(DiagrammeR::grViz(txt[[trees + 1]], width = plot_width, height = plot_height))
   }
 
-  dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)
+  dt <- xgb.model.dt.tree(model = model, trees = trees)
 
   dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Gain)]
   if (show_node_id)
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index a313ed32f..f0f2332b5 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -393,7 +393,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   # Note: it might look like these aren't used, but they need to be defined in this
   # environment for the callbacks for work correctly.
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
-  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
 
   if (is_update && nrounds > niter_init)
     stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
diff --git a/R-package/demo/predict_first_ntree.R b/R-package/demo/predict_first_ntree.R
index 02c168b77..179c18c70 100644
--- a/R-package/demo/predict_first_ntree.R
+++ b/R-package/demo/predict_first_ntree.R
@@ -15,7 +15,7 @@ cat('start testing prediction from first n trees\n')
 labels <- getinfo(dtest, 'label')
 
 ### predict using first 1 tree
-ypred1 <- predict(bst, dtest, ntreelimit = 1)
+ypred1 <- predict(bst, dtest, iterationrange = c(1, 1))
 # by default, we predict using all the trees
 ypred2 <- predict(bst, dtest)
 
diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/cb.cv.predict.Rd
index ded899e8a..4cabac1c9 100644
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/cb.cv.predict.Rd
@@ -35,8 +35,6 @@ Callback function expects the following values to be set in its calling frame:
 \code{data},
 \code{end_iteration},
 \code{params},
-\code{num_parallel_tree},
-\code{num_class}.
 }
 \seealso{
 \code{\link{callbacks}}
diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd
index 7b6efa842..7cd51a3ce 100644
--- a/R-package/man/cb.early.stop.Rd
+++ b/R-package/man/cb.early.stop.Rd
@@ -55,7 +55,6 @@ Callback function expects the following values to be set in its calling frame:
 \code{iteration},
 \code{begin_iteration},
 \code{end_iteration},
-\code{num_parallel_tree}.
 }
 \seealso{
 \code{\link{callbacks}},
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 66194c64f..7a6dd6c13 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -9,7 +9,6 @@
   newdata,
   missing = NA,
   outputmargin = FALSE,
-  ntreelimit = NULL,
   predleaf = FALSE,
   predcontrib = FALSE,
   approxcontrib = FALSE,
@@ -36,8 +35,6 @@ missing values in data (e.g., 0 or some other extreme value).}
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would return log-odds instead of probabilities.}
 
-\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}
-
 \item{predleaf}{Whether to predict pre-tree leaf indices.}
 
 \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}
@@ -53,11 +50,18 @@ or \code{predinteraction} is \code{TRUE}.}
 \item{training}{Whether the predictions are used for training. For dart booster,
 training predicting will perform dropout.}
 
-\item{iterationrange}{Specifies which trees are used in prediction. For
-example, take a random forest with 100 rounds.
-With \code{iterationrange=c(1, 21)}, only the trees built during \verb{[1, 21)} (half open set)
-rounds are used in this prediction. The index is 1-based just like an R vector. When set
-to \code{c(1, 1)}, XGBoost will use all trees.}
+\item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
+a two-dimensional vector with the start and end numbers in the sequence (same format as R's \code{seq} - i.e.
+base-1 indexing, and inclusive of both ends).
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
+   predict using only the first one.
+
+   If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
+   of the iterations (rounds) otherwise.
+
+   If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+}\if{html}{\out{</div>}}}
 
 \item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
 type and shape of predictions are invariant to the model type.}
@@ -145,7 +149,7 @@ bst <- xgb.train(
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
+pred1 <- predict(bst, test$data, iterationrange = c(1, 1))
 
 # Predicting tree leafs:
 # the result is an nsamples X ntrees matrix
@@ -216,7 +220,7 @@ str(pred)
 all.equal(pred, pred_labels)
 # prediction from using only 5 iterations should result
 # in the same error as seen in iteration 5:
-pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
+pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 5))
 sum(pred5 != lb) / length(lb)
 
 }
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 2d8508c4d..9f6103a52 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -135,7 +135,6 @@ It is created by the \code{\link{cb.evaluation.log}} callback.
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
 \item \code{pred} CV prediction values available when \code{prediction} is set.
 It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 \item \code{models} a list of the CV folds' models. It is only available with the explicit
diff --git a/R-package/man/xgb.get.num.boosted.rounds.Rd b/R-package/man/xgb.get.num.boosted.rounds.Rd
index 74c94d95b..551dc4a83 100644
--- a/R-package/man/xgb.get.num.boosted.rounds.Rd
+++ b/R-package/man/xgb.get.num.boosted.rounds.Rd
@@ -2,12 +2,15 @@
 % Please edit documentation in R/xgb.Booster.R
 \name{xgb.get.num.boosted.rounds}
 \alias{xgb.get.num.boosted.rounds}
+\alias{length.xgb.Booster}
 \title{Get number of boosting in a fitted booster}
 \usage{
 xgb.get.num.boosted.rounds(model)
+
+\method{length}{xgb.Booster}(x)
 }
 \arguments{
-\item{model}{A fitted \code{xgb.Booster} model.}
+\item{model, x}{A fitted \code{xgb.Booster} model.}
 }
 \value{
 The number of rounds saved in the model, as an integer.
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index fca1b70c4..73b91e8b4 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -5,8 +5,8 @@
 \title{Feature importance}
 \usage{
 xgb.importance(
-  feature_names = NULL,
   model = NULL,
+  feature_names = getinfo(model, "feature_name"),
   trees = NULL,
   data = NULL,
   label = NULL,
@@ -14,11 +14,11 @@ xgb.importance(
 )
 }
 \arguments{
+\item{model}{Object of class \code{xgb.Booster}.}
+
 \item{feature_names}{Character vector used to overwrite the feature names
 of the model. The default is \code{NULL} (use original feature names).}
 
-\item{model}{Object of class \code{xgb.Booster}.}
-
 \item{trees}{An integer vector of tree indices that should be included
 into the importance calculation (only for the "gbtree" booster).
 The default (\code{NULL}) parses all trees.
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index e63bd4b10..75f1cd0f4 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -5,7 +5,6 @@
 \title{Parse model text dump}
 \usage{
 xgb.model.dt.tree(
-  feature_names = NULL,
   model = NULL,
   text = NULL,
   trees = NULL,
@@ -14,13 +13,8 @@ xgb.model.dt.tree(
 )
 }
 \arguments{
-\item{feature_names}{Character vector of feature names. If the model already
-contains feature names, those will be used when \code{feature_names=NULL} (default value).
-
-\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note that, if the model already contains feature names, it's \\bold\{not\} possible to override them here.
-}\if{html}{\out{</div>}}}
-
-\item{model}{Object of class \code{xgb.Booster}.}
+\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
+\link{setinfo}), they will be used in the output from this function.}
 
 \item{text}{Character vector previously generated by the function \code{\link[=xgb.dump]{xgb.dump()}}
 (called with parameter \code{with_stats = TRUE}). \code{text} takes precedence over \code{model}.}
@@ -81,7 +75,7 @@ bst <- xgboost(
 
 # This bst model already has feature_names stored with it, so those would be used when
 # feature_names is not set:
-(dt <- xgb.model.dt.tree(model = bst))
+dt <- xgb.model.dt.tree(bst)
 
 # How to match feature names of splits that are following a current 'Yes' branch:
 merge(
diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd
index d98a3482c..7fa75c85d 100644
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -6,7 +6,6 @@
 \usage{
 xgb.plot.multi.trees(
   model,
-  feature_names = NULL,
   features_keep = 5,
   plot_width = NULL,
   plot_height = NULL,
@@ -15,10 +14,8 @@ xgb.plot.multi.trees(
 )
 }
 \arguments{
-\item{model}{Object of class \code{xgb.Booster}.}
-
-\item{feature_names}{Character vector used to overwrite the feature names
-of the model. The default (\code{NULL}) uses the original feature names.}
+\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
+\link{setinfo}), they will be used in the output from this function.}
 
 \item{features_keep}{Number of features to keep in each position of the multi trees,
 by default 5.}
diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd
index a09bb7183..69d37301d 100644
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -5,7 +5,6 @@
 \title{Plot boosted trees}
 \usage{
 xgb.plot.tree(
-  feature_names = NULL,
   model = NULL,
   trees = NULL,
   plot_width = NULL,
@@ -17,10 +16,8 @@ xgb.plot.tree(
 )
 }
 \arguments{
-\item{feature_names}{Character vector used to overwrite the feature names
-of the model. The default (\code{NULL}) uses the original feature names.}
-
-\item{model}{Object of class \code{xgb.Booster}.}
+\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
+\link{setinfo}), they will be used in the output from this function.}
 
 \item{trees}{An integer vector of tree indices that should be used.
 The default (\code{NULL}) uses all trees.
diff --git a/R-package/man/xgb.slice.Booster.Rd b/R-package/man/xgb.slice.Booster.Rd
new file mode 100644
index 000000000..759139901
--- /dev/null
+++ b/R-package/man/xgb.slice.Booster.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.Booster.R
+\name{xgb.slice.Booster}
+\alias{xgb.slice.Booster}
+\alias{[.xgb.Booster}
+\title{Slice Booster by Rounds}
+\usage{
+xgb.slice.Booster(
+  model,
+  start,
+  end = xgb.get.num.boosted.rounds(model),
+  step = 1L
+)
+
+\method{[}{xgb.Booster}(x, i)
+}
+\arguments{
+\item{model, x}{A fitted \code{xgb.Booster} object, which is to be sliced by taking only a subset
+of its rounds / iterations.}
+
+\item{start}{Start of the slice (base-1 and inclusive, like R's \link{seq}).}
+
+\item{end}{End of the slice (base-1 and inclusive, like R's \link{seq}).
+
+Passing a value of zero here is equivalent to passing the full number of rounds in the
+booster object.}
+
+\item{step}{Step size of the slice. Passing '1' will take every round in the sequence defined by
+\verb{(start, end)}, while passing '2' will take every second value, and so on.}
+
+\item{i}{The indices - must be an increasing sequence as generated by e.g. \code{seq(...)}.}
+}
+\value{
+A sliced booster object containing only the requested rounds.
+}
+\description{
+Creates a new booster including only a selected range of rounds / iterations
+from an existing booster, as given by the sequence \code{seq(start, end, step)}.
+}
+\details{
+Note that any R attributes that the booster might have, will not be copied into
+the resulting object.
+}
+\examples{
+data(mtcars)
+y <- mtcars$mpg
+x <- as.matrix(mtcars[, -1])
+dm <- xgb.DMatrix(x, label = y, nthread = 1)
+model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
+model_slice <- xgb.slice.Booster(model, 1, 3)
+# Prediction for first three rounds
+predict(model, x, predleaf = TRUE)[, 1:3]
+
+# The new model has only those rounds, so
+# a full prediction from it is equivalent
+predict(model_slice, x, predleaf = TRUE)
+}
diff --git a/R-package/src/init.c b/R-package/src/init.c
index 81c28c401..fff5d9f90 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -42,6 +42,8 @@ extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP);
 extern SEXP XGCheckNullPtr_R(SEXP);
+extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
+extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
@@ -62,6 +64,7 @@ extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
 extern SEXP XGBSetGlobalConfig_R(SEXP);
 extern SEXP XGBGetGlobalConfig_R(void);
 extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
+extern SEXP XGBoosterSlice_R(SEXP, SEXP, SEXP, SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
   {"XGDuplicate_R",               (DL_FUNC) &XGDuplicate_R,               1},
@@ -90,6 +93,8 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterSetParam_R",         (DL_FUNC) &XGBoosterSetParam_R,         3},
   {"XGBoosterUpdateOneIter_R",    (DL_FUNC) &XGBoosterUpdateOneIter_R,    3},
   {"XGCheckNullPtr_R",            (DL_FUNC) &XGCheckNullPtr_R,            1},
+  {"XGSetArrayDimInplace_R",      (DL_FUNC) &XGSetArrayDimInplace_R,      2},
+  {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
   {"XGDMatrixCreateFromCSC_R",    (DL_FUNC) &XGDMatrixCreateFromCSC_R,    6},
   {"XGDMatrixCreateFromCSR_R",    (DL_FUNC) &XGDMatrixCreateFromCSR_R,    6},
   {"XGDMatrixCreateFromFile_R",   (DL_FUNC) &XGDMatrixCreateFromFile_R,   2},
@@ -110,6 +115,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBSetGlobalConfig_R",        (DL_FUNC) &XGBSetGlobalConfig_R,        1},
   {"XGBGetGlobalConfig_R",        (DL_FUNC) &XGBGetGlobalConfig_R,        0},
   {"XGBoosterFeatureScore_R",     (DL_FUNC) &XGBoosterFeatureScore_R,     2},
+  {"XGBoosterSlice_R",            (DL_FUNC) &XGBoosterSlice_R,            4},
   {NULL, NULL, 0}
 };
 
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 63f36ad6a..1d01b9aae 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -263,6 +263,16 @@ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
   return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
 }
 
+XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims) {
+  Rf_setAttrib(arr, R_DimSymbol, dims);
+  return R_NilValue;
+}
+
+XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names) {
+  Rf_setAttrib(arr, R_DimNamesSymbol, dim_names);
+  return R_NilValue;
+}
+
 namespace {
 void _DMatrixFinalizer(SEXP ext) {
   R_API_BEGIN();
@@ -1279,3 +1289,18 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
 
   return r_out;
 }
+
+XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step) {
+  SEXP out = Rf_protect(XGBMakeEmptyAltrep());
+  R_API_BEGIN();
+  BoosterHandle handle_out = nullptr;
+  CHECK_CALL(XGBoosterSlice(R_ExternalPtrAddr(handle),
+                            Rf_asInteger(begin_layer),
+                            Rf_asInteger(end_layer),
+                            Rf_asInteger(step),
+                            &handle_out));
+  XGBAltrepSetPointer(out, handle_out);
+  R_API_END();
+  Rf_unprotect(1);
+  return out;
+}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 79d441792..ec30dbada 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -23,6 +23,22 @@
  */
 XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle);
 
+/*!
+ * \brief set the dimensions of an array in-place
+ * \param arr
+ * \param dims dimensions to set to the array
+ * \return NULL value
+ */
+XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims);
+
+/*!
+ * \brief set the names of the dimensions of an array in-place
+ * \param arr
+ * \param dim_names names for the dimensions to set
+ * \return NULL value
+ */
+XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names);
+
 /*!
  * \brief Set global configuration
  * \param json_str a JSON string representing the list of key-value pairs
@@ -386,4 +402,14 @@ XGB_DLL SEXP XGBoosterGetAttrNames_R(SEXP handle);
  */
 XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config);
 
+/*!
+ * \brief Slice a fitted booster model (by rounds)
+ * \param handle handle to the fitted booster
+ * \param begin_layer start of the slice
+ * \param end_later end of the slice; end_layer=0 is equivalent to end_layer=num_boost_round
+ * \param step step size of the slice
+ * \return The sliced booster with the requested rounds only
+ */
+XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
+
 #endif  // XGBOOST_WRAPPER_R_H_ // NOLINT(*)
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 8dd934765..03a8ddbe1 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -33,15 +33,11 @@ test_that("train and predict binary classification", {
   pred <- predict(bst, test$data)
   expect_length(pred, 1611)
 
-  pred1 <- predict(bst, train$data, ntreelimit = 1)
+  pred1 <- predict(bst, train$data, iterationrange = c(1, 1))
   expect_length(pred1, 6513)
   err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
   err_log <- attributes(bst)$evaluation_log[1, train_error]
   expect_lt(abs(err_pred1 - err_log), 10e-6)
-
-  pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
-  expect_length(pred1, 6513)
-  expect_equal(pred1, pred2)
 })
 
 test_that("parameter validation works", {
@@ -117,8 +113,8 @@ test_that("dart prediction works", {
     nrounds = nrounds,
     objective = "reg:squarederror"
   )
-  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
-  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
+  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, iterationrange = NULL)
+  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, iterationrange = c(1, nrounds))
   expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
 
   pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
@@ -139,8 +135,8 @@ test_that("dart prediction works", {
     data = dtrain,
     nrounds = nrounds
   )
-  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
-  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
+  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, iterationrange = NULL)
+  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds))
   pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
 
   expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
@@ -162,7 +158,7 @@ test_that("train and predict softprob", {
   )
   expect_false(is.null(attributes(bst)$evaluation_log))
   expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
-  expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
+  expect_equal(xgb.get.num.boosted.rounds(bst), 5)
   pred <- predict(bst, as.matrix(iris[, -5]))
   expect_length(pred, nrow(iris) * 3)
   # row sums add up to total probability of 1:
@@ -174,12 +170,12 @@ test_that("train and predict softprob", {
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
   # manually calculate error at the 1st iteration:
-  mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
+  mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1))
   pred_labels <- max.col(mpred) - 1
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6)
 
-  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
+  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1))
   expect_equal(mpred, mpred1)
 
   d <- cbind(
@@ -213,7 +209,7 @@ test_that("train and predict softmax", {
   )
   expect_false(is.null(attributes(bst)$evaluation_log))
   expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
-  expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
+  expect_equal(xgb.get.num.boosted.rounds(bst), 5)
 
   pred <- predict(bst, as.matrix(iris[, -5]))
   expect_length(pred, nrow(iris))
@@ -233,19 +229,15 @@ test_that("train and predict RF", {
     watchlist = list(train = xgb.DMatrix(train$data, label = lb))
   )
   expect_equal(xgb.get.num.boosted.rounds(bst), 1)
-  expect_equal(xgb.ntree(bst), 20)
 
   pred <- predict(bst, train$data)
   pred_err <- sum((pred > 0.5) != lb) / length(lb)
   expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6)
   # expect_lt(pred_err, 0.03)
 
-  pred <- predict(bst, train$data, ntreelimit = 20)
+  pred <- predict(bst, train$data, iterationrange = c(1, 1))
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
   expect_equal(pred_err_20, pred_err)
-
-  pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
-  expect_equal(pred, pred1)
 })
 
 test_that("train and predict RF with softprob", {
@@ -261,7 +253,6 @@ test_that("train and predict RF with softprob", {
     watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
   )
   expect_equal(xgb.get.num.boosted.rounds(bst), 15)
-  expect_equal(xgb.ntree(bst), 15 * 3 * 4)
   # predict for all iterations:
   pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
   expect_equal(dim(pred), c(nrow(iris), 3))
@@ -269,7 +260,7 @@ test_that("train and predict RF with softprob", {
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
   # predict for 7 iterations and adjust for 4 parallel trees per iteration
-  pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
+  pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 7))
   err <- sum((max.col(pred) - 1) != lb) / length(lb)
   expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6)
 })
diff --git a/R-package/tests/testthat/test_booster_slicing.R b/R-package/tests/testthat/test_booster_slicing.R
new file mode 100644
index 000000000..711ccd8b6
--- /dev/null
+++ b/R-package/tests/testthat/test_booster_slicing.R
@@ -0,0 +1,67 @@
+context("testing xgb.Booster slicing")
+
+data(agaricus.train, package = "xgboost")
+dm <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
+# Note: here need large step sizes in order for the predictions
+# to have substantially different leaf assignments on each tree
+model <- xgb.train(
+  params = list(objective = "binary:logistic", nthread = 1, max_depth = 4, eta = 0.5),
+  data = dm,
+  nrounds = 20
+)
+pred <- predict(model, dm, predleaf = TRUE, reshape = TRUE)
+
+test_that("Slicing full model", {
+  new_model <- xgb.slice.Booster(model, 1, 0)
+  expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
+
+  new_model <- model[]
+  expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
+
+  new_model <- model[1:length(model)] # nolint
+  expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
+})
+
+test_that("Slicing sequence from start", {
+  new_model <- xgb.slice.Booster(model, 1, 10)
+  new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+  expect_equal(new_pred, pred[, seq(1, 10)])
+
+  new_model <- model[1:10]
+  new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+  expect_equal(new_pred, pred[, seq(1, 10)])
+})
+
+test_that("Slicing sequence from middle", {
+  new_model <- xgb.slice.Booster(model, 5, 10)
+  new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+  expect_equal(new_pred, pred[, seq(5, 10)])
+
+  new_model <- model[5:10]
+  new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+  expect_equal(new_pred, pred[, seq(5, 10)])
+})
+
+test_that("Slicing with non-unit step", {
+  for (s in 2:5) {
+    new_model <- xgb.slice.Booster(model, 1, 17, s)
+    new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+    expect_equal(new_pred, pred[, seq(1, 17, s)])
+
+    new_model <- model[seq(1, 17, s)]
+    new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+    expect_equal(new_pred, pred[, seq(1, 17, s)])
+  }
+})
+
+test_that("Slicing with non-unit step from middle", {
+  for (s in 2:5) {
+    new_model <- xgb.slice.Booster(model, 4, 17, s)
+    new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+    expect_equal(new_pred, pred[, seq(4, 17, s)])
+
+    new_model <- model[seq(4, 17, s)]
+    new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
+    expect_equal(new_pred, pred[, seq(4, 17, s)])
+  }
+})
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index afa270c0b..c60d0c246 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -211,12 +211,11 @@ test_that("early stopping xgb.train works", {
   , "Stopping. Best iteration")
   expect_false(is.null(xgb.attr(bst, "best_iteration")))
   expect_lt(xgb.attr(bst, "best_iteration"), 19)
-  expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
 
   pred <- predict(bst, dtest)
   expect_equal(length(pred), 1611)
   err_pred <- err(ltest, pred)
-  err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error]
+  err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration") + 1, test_error]
   expect_equal(err_log, err_pred, tolerance = 5e-6)
 
   set.seed(11)
@@ -231,8 +230,7 @@ test_that("early stopping xgb.train works", {
   loaded <- xgb.load(fname)
 
   expect_false(is.null(xgb.attr(loaded, "best_iteration")))
-  expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
-  expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit"))
+  expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_iteration"))
 })
 
 test_that("early stopping using a specific metric works", {
@@ -245,12 +243,11 @@ test_that("early stopping using a specific metric works", {
   , "Stopping. Best iteration")
   expect_false(is.null(xgb.attr(bst, "best_iteration")))
   expect_lt(xgb.attr(bst, "best_iteration"), 19)
-  expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
 
-  pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit"))
+  pred <- predict(bst, dtest, iterationrange = c(1, xgb.attr(bst, "best_iteration") + 1))
   expect_equal(length(pred), 1611)
   logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest)
-  logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss]
+  logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration") + 1, test_logloss]
   expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
 })
 
@@ -286,7 +283,6 @@ test_that("early stopping xgb.cv works", {
   , "Stopping. Best iteration")
   expect_false(is.null(cv$best_iteration))
   expect_lt(cv$best_iteration, 19)
-  expect_equal(cv$best_iteration, cv$best_ntreelimit)
   # the best error is min error:
   expect_true(cv$evaluation_log[, test_error_mean[cv$best_iteration] == min(test_error_mean)])
 })
@@ -354,3 +350,44 @@ test_that("prediction in xgb.cv for softprob works", {
   expect_equal(dim(cv$pred), c(nrow(iris), 3))
   expect_lt(diff(range(rowSums(cv$pred))), 1e-6)
 })
+
+test_that("prediction in xgb.cv works for multi-quantile", {
+  data(mtcars)
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1])
+  dm <- xgb.DMatrix(x, label = y, nthread = 1)
+  cv <- xgb.cv(
+    data = dm,
+    params = list(
+      objective = "reg:quantileerror",
+      quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9),
+      nthread = 1
+    ),
+    nrounds = 5,
+    nfold = 3,
+    prediction = TRUE,
+    verbose = 0
+  )
+  expect_equal(dim(cv$pred), c(nrow(x), 5))
+})
+
+test_that("prediction in xgb.cv works for multi-output", {
+  data(mtcars)
+  y <- mtcars$mpg
+  x <- as.matrix(mtcars[, -1])
+  dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1)
+  cv <- xgb.cv(
+    data = dm,
+    params = list(
+      tree_method = "hist",
+      multi_strategy = "multi_output_tree",
+      objective = "reg:squarederror",
+      nthread = n_threads
+    ),
+    nrounds = 5,
+    nfold = 3,
+    prediction = TRUE,
+    verbose = 0
+  )
+  expect_equal(dim(cv$pred), c(nrow(x), 2))
+})
diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R
index ae698d98f..349bcce8d 100644
--- a/R-package/tests/testthat/test_glm.R
+++ b/R-package/tests/testthat/test_glm.R
@@ -72,10 +72,10 @@ test_that("gblinear early stopping works", {
   booster <- xgb.train(
     param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
   )
-  expect_equal(xgb.attr(booster, "best_iteration"), 5)
+  expect_equal(xgb.attr(booster, "best_iteration"), 4)
   predt_es <- predict(booster, dtrain)
 
-  n <- xgb.attr(booster, "best_iteration") + es_round
+  n <- xgb.attr(booster, "best_iteration") + es_round + 1
   booster <- xgb.train(
     param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
   )
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 372f2520c..badac0213 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -282,9 +282,6 @@ test_that("xgb.model.dt.tree works with and without feature names", {
     expect_equal(dim(dt.tree), c(188, 10))
   expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
 
-  dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
-  expect_equal(dt.tree, dt.tree.0)
-
   # when model contains no feature names:
   dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed)
   expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
@@ -304,7 +301,7 @@ test_that("xgb.model.dt.tree throws error for gblinear", {
 
 test_that("xgb.importance works with and without feature names", {
   .skip_if_vcd_not_available()
-  importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
+  importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree.unnamed)
   if (!flag_32bit)
     expect_equal(dim(importance.Tree), c(7, 4))
   expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
@@ -330,9 +327,8 @@ test_that("xgb.importance works with and without feature names", {
   importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
 
   importance_from_dump <- function() {
-    model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees)
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
     imp <- xgb.model.dt.tree(
-      feature_names = feature.names,
       text = model_text_dump,
       trees = trees
     )[
diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R
index 277c8f288..e49a32025 100644
--- a/R-package/tests/testthat/test_ranking.R
+++ b/R-package/tests/testthat/test_ranking.R
@@ -44,7 +44,7 @@ test_that('Test ranking with weighted data', {
   expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
   expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
   for (i in 1:10) {
-    pred <- predict(bst, newdata = dtrain, ntreelimit = i)
+    pred <- predict(bst, newdata = dtrain, iterationrange = c(1, i))
     # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
     is_sorted <- lapply(seq(1, 20, by = 5),
       function(k) {
diff --git a/demo/guide-python/continuation.py b/demo/guide-python/continuation.py
index 84afc3710..e32c48665 100644
--- a/demo/guide-python/continuation.py
+++ b/demo/guide-python/continuation.py
@@ -16,14 +16,14 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
     """Basic training continuation."""
     # Train 128 iterations in 1 session
     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=128)
-    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
+    clf = xgboost.XGBClassifier(n_estimators=128, eval_metric="logloss")
+    clf.fit(X, y, eval_set=[(X, y)])
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
 
     # Train 128 iterations in 2 sessions, with the first one runs for 32 iterations and
     # the second one runs for 96 iterations
-    clf = xgboost.XGBClassifier(n_estimators=32)
-    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
+    clf = xgboost.XGBClassifier(n_estimators=32, eval_metric="logloss")
+    clf.fit(X, y, eval_set=[(X, y)])
     assert clf.get_booster().num_boosted_rounds() == 32
 
     # load back the model, this could be a checkpoint
@@ -39,8 +39,8 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
         loaded = xgboost.XGBClassifier()
         loaded.load_model(path)
 
-    clf = xgboost.XGBClassifier(n_estimators=128 - 32)
-    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", xgb_model=loaded)
+    clf = xgboost.XGBClassifier(n_estimators=128 - 32, eval_metric="logloss")
+    clf.fit(X, y, eval_set=[(X, y)], xgb_model=loaded)
 
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
 
@@ -56,19 +56,24 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     n_estimators = 512
 
     X, y = load_breast_cancer(return_X_y=True)
-    clf = xgboost.XGBClassifier(n_estimators=n_estimators)
-    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
+    clf = xgboost.XGBClassifier(
+        n_estimators=n_estimators, eval_metric="logloss", callbacks=[early_stop]
+    )
+    clf.fit(X, y, eval_set=[(X, y)])
     print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
     best = clf.best_iteration
 
     # Train 512 iterations in 2 sessions, with the first one runs for 128 iterations and
     # the second one runs until early stop.
-    clf = xgboost.XGBClassifier(n_estimators=128)
+    clf = xgboost.XGBClassifier(
+        n_estimators=128, eval_metric="logloss", callbacks=[early_stop]
+    )
     # Reinitialize the early stop callback
     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
     )
-    clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
+    clf.set_params(callbacks=[early_stop])
+    clf.fit(X, y, eval_set=[(X, y)])
     assert clf.get_booster().num_boosted_rounds() == 128
 
     # load back the model, this could be a checkpoint
@@ -87,13 +92,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
     early_stop = xgboost.callback.EarlyStopping(
         rounds=early_stopping_rounds, save_best=True
     )
-    clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128)
+    clf = xgboost.XGBClassifier(
+        n_estimators=n_estimators - 128, eval_metric="logloss", callbacks=[early_stop]
+    )
     clf.fit(
         X,
         y,
         eval_set=[(X, y)],
-        eval_metric="logloss",
-        callbacks=[early_stop],
         xgb_model=loaded,
     )
 
diff --git a/demo/guide-python/quantile_regression.py b/demo/guide-python/quantile_regression.py
index 5d186714c..a9e4532ba 100644
--- a/demo/guide-python/quantile_regression.py
+++ b/demo/guide-python/quantile_regression.py
@@ -46,10 +46,11 @@ def quantile_loss(args: argparse.Namespace) -> None:
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
     # We will be using the `hist` tree method, quantile DMatrix can be used to preserve
-    # memory.
+    # memory (which has nothing to do with quantile regression itself, see its document
+    # for details).
     # Do not use the `exact` tree method for quantile regression, otherwise the
     # performance might drop.
-    Xy = xgb.QuantileDMatrix(X, y)
+    Xy = xgb.QuantileDMatrix(X_train, y_train)
     # use Xy as a reference
     Xy_test = xgb.QuantileDMatrix(X_test, y_test, ref=Xy)
 
diff --git a/demo/guide-python/sklearn_evals_result.py b/demo/guide-python/sklearn_evals_result.py
index 9aed58500..781ab81af 100644
--- a/demo/guide-python/sklearn_evals_result.py
+++ b/demo/guide-python/sklearn_evals_result.py
@@ -16,30 +16,35 @@ labels, y = np.unique(y, return_inverse=True)
 X_train, X_test = X[:1600], X[1600:]
 y_train, y_test = y[:1600], y[1600:]
 
-param_dist = {'objective':'binary:logistic', 'n_estimators':2}
+param_dist = {"objective": "binary:logistic", "n_estimators": 2}
 
-clf = xgb.XGBModel(**param_dist)
+clf = xgb.XGBModel(
+    **param_dist,
+    eval_metric="logloss",
+)
 # Or you can use: clf = xgb.XGBClassifier(**param_dist)
 
-clf.fit(X_train, y_train,
-        eval_set=[(X_train, y_train), (X_test, y_test)],
-        eval_metric='logloss',
-        verbose=True)
+clf.fit(
+    X_train,
+    y_train,
+    eval_set=[(X_train, y_train), (X_test, y_test)],
+    verbose=True,
+)
 
 # Load evals result by calling the evals_result() function
 evals_result = clf.evals_result()
 
-print('Access logloss metric directly from validation_0:')
-print(evals_result['validation_0']['logloss'])
+print("Access logloss metric directly from validation_0:")
+print(evals_result["validation_0"]["logloss"])
 
-print('')
-print('Access metrics through a loop:')
+print("")
+print("Access metrics through a loop:")
 for e_name, e_mtrs in evals_result.items():
-    print('- {}'.format(e_name))
+    print("- {}".format(e_name))
     for e_mtr_name, e_mtr_vals in e_mtrs.items():
-        print('   - {}'.format(e_mtr_name))
-        print('      - {}'.format(e_mtr_vals))
+        print("   - {}".format(e_mtr_name))
+        print("      - {}".format(e_mtr_vals))
 
-print('')
-print('Access complete dict:')
+print("")
+print("Access complete dict:")
 print(evals_result)
diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index cf33e959a..0fe7a8e24 100644
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -1,4 +1,4 @@
-'''
+"""
 Collection of examples for using sklearn interface
 ==================================================
 
@@ -8,7 +8,7 @@ For an introduction to XGBoost's scikit-learn estimator interface, see
 Created on 1 Apr 2015
 
 @author: Jamie Hall
-'''
+"""
 import pickle
 
 import numpy as np
@@ -22,8 +22,8 @@ rng = np.random.RandomState(31337)
 
 print("Zeros and Ones from the Digits dataset: binary classification")
 digits = load_digits(n_class=2)
-y = digits['target']
-X = digits['data']
+y = digits["target"]
+X = digits["data"]
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
     xgb_model = xgb.XGBClassifier(n_jobs=1).fit(X[train_index], y[train_index])
@@ -33,8 +33,8 @@ for train_index, test_index in kf.split(X):
 
 print("Iris: multiclass classification")
 iris = load_iris()
-y = iris['target']
-X = iris['data']
+y = iris["target"]
+X = iris["data"]
 kf = KFold(n_splits=2, shuffle=True, random_state=rng)
 for train_index, test_index in kf.split(X):
     xgb_model = xgb.XGBClassifier(n_jobs=1).fit(X[train_index], y[train_index])
@@ -53,9 +53,13 @@ for train_index, test_index in kf.split(X):
 
 print("Parameter optimization")
 xgb_model = xgb.XGBRegressor(n_jobs=1)
-clf = GridSearchCV(xgb_model,
-                   {'max_depth': [2, 4],
-                    'n_estimators': [50, 100]}, verbose=1, n_jobs=1, cv=3)
+clf = GridSearchCV(
+    xgb_model,
+    {"max_depth": [2, 4], "n_estimators": [50, 100]},
+    verbose=1,
+    n_jobs=1,
+    cv=3,
+)
 clf.fit(X, y)
 print(clf.best_score_)
 print(clf.best_params_)
@@ -69,9 +73,8 @@ print(np.allclose(clf.predict(X), clf2.predict(X)))
 
 # Early-stopping
 
-X = digits['data']
-y = digits['target']
+X = digits["data"]
+y = digits["target"]
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
-clf = xgb.XGBClassifier(n_jobs=1)
-clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
-        eval_set=[(X_test, y_test)])
+clf = xgb.XGBClassifier(n_jobs=1, early_stopping_rounds=10, eval_metric="auc")
+clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
diff --git a/demo/guide-python/sklearn_parallel.py b/demo/guide-python/sklearn_parallel.py
index 2ebefffc7..55e0bff74 100644
--- a/demo/guide-python/sklearn_parallel.py
+++ b/demo/guide-python/sklearn_parallel.py
@@ -12,6 +12,7 @@ import xgboost as xgb
 if __name__ == "__main__":
     print("Parallel Parameter optimization")
     X, y = fetch_california_housing(return_X_y=True)
+    # Make sure the number of threads is balanced.
     xgb_model = xgb.XGBRegressor(
         n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
     )
diff --git a/dev/change_scala_version.py b/dev/change_scala_version.py
new file mode 100644
index 000000000..d9438f76a
--- /dev/null
+++ b/dev/change_scala_version.py
@@ -0,0 +1,79 @@
+import argparse
+import pathlib
+import re
+import shutil
+
+
+def main(args):
+    if args.scala_version == "2.12":
+        scala_ver = "2.12"
+        scala_patchver = "2.12.18"
+    elif args.scala_version == "2.13":
+        scala_ver = "2.13"
+        scala_patchver = "2.13.11"
+    else:
+        raise ValueError(f"Unsupported Scala version: {args.scala_version}")
+
+    # Clean artifacts
+    if args.purge_artifacts:
+        for target in pathlib.Path("jvm-packages/").glob("**/target"):
+            if target.is_dir():
+                print(f"Removing {target}...")
+                shutil.rmtree(target)
+
+    # Update pom.xml
+    for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"):
+        print(f"Updating {pom}...")
+        with open(pom, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        with open(pom, "w", encoding="utf-8") as f:
+            replaced_scalaver = False
+            replaced_scala_binver = False
+            for line in lines:
+                for artifact in [
+                    "xgboost-jvm",
+                    "xgboost4j",
+                    "xgboost4j-gpu",
+                    "xgboost4j-spark",
+                    "xgboost4j-spark-gpu",
+                    "xgboost4j-flink",
+                    "xgboost4j-example",
+                ]:
+                    line = re.sub(
+                        f"<artifactId>{artifact}_[0-9\\.]*",
+                        f"<artifactId>{artifact}_{scala_ver}",
+                        line,
+                    )
+                # Only replace the first occurrence of scala.version
+                if not replaced_scalaver:
+                    line, nsubs = re.subn(
+                        r"<scala.version>[0-9\.]*",
+                        f"<scala.version>{scala_patchver}",
+                        line,
+                    )
+                    if nsubs > 0:
+                        replaced_scalaver = True
+                # Only replace the first occurrence of scala.binary.version
+                if not replaced_scala_binver:
+                    line, nsubs = re.subn(
+                        r"<scala.binary.version>[0-9\.]*",
+                        f"<scala.binary.version>{scala_ver}",
+                        line,
+                    )
+                    if nsubs > 0:
+                        replaced_scala_binver = True
+                f.write(line)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--purge-artifacts", action="store_true")
+    parser.add_argument(
+        "--scala-version",
+        type=str,
+        required=True,
+        help="Version of Scala to use in the JVM packages",
+        choices=["2.12", "2.13"],
+    )
+    parsed_args = parser.parse_args()
+    main(parsed_args)
diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py
index 0cf5796a2..5d4d2e66f 100644
--- a/dev/prepare_jvm_release.py
+++ b/dev/prepare_jvm_release.py
@@ -2,7 +2,6 @@ import argparse
 import errno
 import glob
 import os
-import platform
 import re
 import shutil
 import subprocess
@@ -88,10 +87,6 @@ def main():
         help="Version of the release being prepared",
     )
     args = parser.parse_args()
-
-    if sys.platform != "darwin" or platform.machine() != "arm64":
-        raise NotImplementedError("Please run this script using an M1 Mac")
-
     version = args.release_version
     expected_git_tag = "v" + version
     current_git_tag = get_current_git_tag()
@@ -141,6 +136,7 @@ def main():
             ("linux", "x86_64"),
             ("windows", "x86_64"),
             ("macos", "x86_64"),
+            ("macos", "aarch64"),
         ]:
             output_dir = f"xgboost4j/src/main/resources/lib/{os_ident}/{arch}"
             maybe_makedirs(output_dir)
@@ -164,6 +160,10 @@ def main():
             url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_{commit_hash}.dylib",
             filename="xgboost4j/src/main/resources/lib/macos/x86_64/libxgboost4j.dylib",
         )
+        retrieve(
+            url=f"{nightly_bucket_prefix}/{git_branch}/libxgboost4j/libxgboost4j_m1_{commit_hash}.dylib",
+            filename="xgboost4j/src/main/resources/lib/macos/aarch64/libxgboost4j.dylib",
+        )
 
         with tempfile.TemporaryDirectory() as tempdir:
             # libxgboost4j.so for Linux x86_64, CPU only
@@ -210,13 +210,31 @@ def main():
         "2. Store the Sonatype credentials in .m2/settings.xml. See insturctions in "
         "https://central.sonatype.org/publish/publish-maven/"
     )
-    print("3. Now on a Mac machine, run:")
-    print("   GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests")
+    print(
+        "3. Now on a Linux machine, run the following to build Scala 2.12 artifacts. "
+        "Make sure to use an Internet connection with fast upload speed:"
+    )
+    print(
+        "   # Skip native build, since we have all needed native binaries from CI\n"
+        "   export MAVEN_SKIP_NATIVE_BUILD=1\n"
+        "   GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
+    )
     print(
         "4. Log into https://oss.sonatype.org/. On the left menu panel, click Staging "
-        "Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-1085 "
+        "Repositories. Visit the URL https://oss.sonatype.org/content/repositories/mldmlc-xxxx "
         "to inspect the staged JAR files. Finally, press Release button to publish the "
-        "artifacts to the Maven Central repository."
+        "artifacts to the Maven Central repository. The top-level metapackage should be "
+        "named xgboost-jvm_2.12."
+    )
+    print(
+        "5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
+        "   export MAVEN_SKIP_NATIVE_BUILD=1\n"
+        "   python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
+        "   GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests"
+    )
+    print(
+        "6. Go to https://oss.sonatype.org/ to release the Scala 2.13 artifacts. "
+        "The top-level metapackage should be named xgboost-jvm_2.13."
     )
 
 
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 429fac078..d5f28f6fc 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -153,7 +153,7 @@ Following steps should be done manually:
 def download_r_packages(
     release: str, branch: str, rc: str, commit: str, outdir: str
 ) -> Tuple[Dict[str, str], List[str]]:
-    platforms = ["win64", "linux"]
+    platforms = ["linux"]
     dirname = os.path.join(outdir, "r-packages")
     if not os.path.exists(dirname):
         os.mkdir(dirname)
diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst
index 118a099c1..36bd0c8d6 100644
--- a/doc/tutorials/custom_metric_obj.rst
+++ b/doc/tutorials/custom_metric_obj.rst
@@ -123,11 +123,11 @@ monitor our model's performance.  As mentioned above, the default metric for ``S
         elements = np.power(np.log1p(y) - np.log1p(predt), 2)
         return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))
 
-Since we are demonstrating in Python, the metric or objective need not be a function,
-any callable object should suffice.  Similar to the objective function, our metric also
-accepts ``predt`` and ``dtrain`` as inputs, but returns the name of the metric itself and a
-floating point value as the result.  After passing it into XGBoost as argument of ``feval``
-parameter:
+Since we are demonstrating in Python, the metric or objective need not be a function, any
+callable object should suffice.  Similar to the objective function, our metric also
+accepts ``predt`` and ``dtrain`` as inputs, but returns the name of the metric itself and
+a floating point value as the result.  After passing it into XGBoost as argument of
+``custom_metric`` parameter:
 
 .. code-block:: python
 
@@ -136,7 +136,7 @@ parameter:
               dtrain=dtrain,
               num_boost_round=10,
               obj=squared_log,
-              feval=rmsle,
+              custom_metric=rmsle,
               evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
               evals_result=results)
 
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index 393dda59c..4c475da2e 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -398,8 +398,8 @@ class RegTree : public Model {
       if (!func(nidx)) {
         return;
       }
-      auto left = self[nidx].LeftChild();
-      auto right = self[nidx].RightChild();
+      auto left = self.LeftChild(nidx);
+      auto right = self.RightChild(nidx);
       if (left != RegTree::kInvalidNodeId) {
         nodes.push(left);
       }
diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py
index e2b15dd2a..395bc79b0 100755
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-import errno
 import argparse
+import errno
 import glob
 import os
 import platform
@@ -19,13 +19,12 @@ CONFIG = {
     "USE_HDFS": "OFF",
     "USE_AZURE": "OFF",
     "USE_S3": "OFF",
-
     "USE_CUDA": "OFF",
     "USE_NCCL": "OFF",
     "USE_HIP": "OFF",
     "USE_RCCL": "OFF",
     "JVM_BINDINGS": "ON",
-    "LOG_CAPI_INVOCATION": "OFF"
+    "LOG_CAPI_INVOCATION": "OFF",
 }
 
 
@@ -72,18 +71,13 @@ def normpath(path):
         return normalized
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--log-capi-invocation', type=str, choices=['ON', 'OFF'], default='OFF')
-    parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
-    parser.add_argument('--use-hip', type=str, choices=['ON', 'OFF'], default='OFF')
-    cli_args = parser.parse_args()
-
+def native_build(args):
     if sys.platform == "darwin":
         # Enable of your compiler supports OpenMP.
         CONFIG["USE_OPENMP"] = "OFF"
-        os.environ["JAVA_HOME"] = subprocess.check_output(
-            "/usr/libexec/java_home").strip().decode()
+        os.environ["JAVA_HOME"] = (
+            subprocess.check_output("/usr/libexec/java_home").strip().decode()
+        )
 
     print("building Java wrapper")
     with cd(".."):
@@ -92,7 +86,7 @@ if __name__ == "__main__":
         with cd(build_dir):
             if sys.platform == "win32":
                 # Force x64 build on Windows.
-                maybe_generator = ' -A x64'
+                maybe_generator = " -A x64"
             else:
                 maybe_generator = ""
             if sys.platform == "linux":
@@ -100,12 +94,12 @@ if __name__ == "__main__":
             else:
                 maybe_parallel_build = ""
 
-            if cli_args.log_capi_invocation == 'ON':
-                CONFIG['LOG_CAPI_INVOCATION'] = 'ON'
+            if cli_args.log_capi_invocation == "ON":
+                CONFIG["LOG_CAPI_INVOCATION"] = "ON"
 
-            if cli_args.use_cuda == 'ON':
-                CONFIG['USE_CUDA'] = 'ON'
-                CONFIG['USE_NCCL'] = 'ON'
+            if cli_args.use_cuda == "ON":
+                CONFIG["USE_CUDA"] = "ON"
+                CONFIG["USE_NCCL"] = "ON"
                 CONFIG["USE_DLOPEN_NCCL"] = "OFF"
             elif cli_args.use_hip== 'ON':
                 CONFIG['USE_HIP'] = 'ON'
@@ -123,7 +117,7 @@ if __name__ == "__main__":
             if gpu_arch_flag is not None:
                 args.append("%s" % gpu_arch_flag)
 
-            lib_dir = os.path.join(os.pardir, 'lib')
+            lib_dir = os.path.join(os.pardir, "lib")
             if os.path.exists(lib_dir):
                 shutil.rmtree(lib_dir)
             run("cmake .. " + " ".join(args) + maybe_generator)
@@ -133,8 +127,10 @@ if __name__ == "__main__":
             run(f'"{sys.executable}" mapfeat.py')
             run(f'"{sys.executable}" mknfold.py machine.txt 1')
 
-    xgboost4j = 'xgboost4j-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip== 'ON' else 'xgboost4j'
-    xgboost4j_spark = 'xgboost4j-spark-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'xgboost4j-spark'
+    xgboost4j = "xgboost4j-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip== "ON" else "xgboost4j"
+    xgboost4j_spark = (
+        "xgboost4j-spark-gpu" if cli_args.use_cuda == "ON" or cli_args.use_hip == "ON" else "xgboost4j-spark"
+    )
 
     print("copying native library")
     library_name, os_folder = {
@@ -149,14 +145,19 @@ if __name__ == "__main__":
         "i86pc": "x86_64",  # on Solaris x86_64
         "sun4v": "sparc",  # on Solaris sparc
         "arm64": "aarch64",  # on macOS & Windows ARM 64-bit
-        "aarch64": "aarch64"
+        "aarch64": "aarch64",
     }[platform.machine().lower()]
-    output_folder = "{}/src/main/resources/lib/{}/{}".format(xgboost4j, os_folder, arch_folder)
+    output_folder = "{}/src/main/resources/lib/{}/{}".format(
+        xgboost4j, os_folder, arch_folder
+    )
     maybe_makedirs(output_folder)
     cp("../lib/" + library_name, output_folder)
 
     print("copying pure-Python tracker")
-    cp("../python-package/xgboost/tracker.py", "{}/src/main/resources".format(xgboost4j))
+    cp(
+        "../python-package/xgboost/tracker.py",
+        "{}/src/main/resources".format(xgboost4j),
+    )
 
     print("copying train/test files")
     maybe_makedirs("{}/src/test/resources".format(xgboost4j_spark))
@@ -172,3 +173,18 @@ if __name__ == "__main__":
     maybe_makedirs("{}/src/test/resources".format(xgboost4j))
     for file in glob.glob("../demo/data/agaricus.*"):
         cp(file, "{}/src/test/resources".format(xgboost4j))
+
+
+if __name__ == "__main__":
+    if "MAVEN_SKIP_NATIVE_BUILD" in os.environ:
+        print("MAVEN_SKIP_NATIVE_BUILD is set. Skipping native build...")
+    else:
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--log-capi-invocation", type=str, choices=["ON", "OFF"], default="OFF"
+        )
+        parser.add_argument(
+            "--use-cuda", type=str, choices=["ON", "OFF"], default="OFF"
+        )
+        cli_args = parser.parse_args()
+        native_build(cli_args)
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 08bfd841c..5b6f82b6a 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -5,7 +5,7 @@
     <modelVersion>4.0.0</modelVersion>
 
     <groupId>ml.dmlc</groupId>
-    <artifactId>xgboost-jvm</artifactId>
+    <artifactId>xgboost-jvm_2.12</artifactId>
     <version>2.1.0-SNAPSHOT</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
@@ -43,10 +43,10 @@
         <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
         <log.capi.invocation>OFF</log.capi.invocation>
         <use.cuda>OFF</use.cuda>
-        <cudf.version>23.10.0</cudf.version>
-        <spark.rapids.version>23.10.0</spark.rapids.version>
+        <cudf.version>23.12.1</cudf.version>
+        <spark.rapids.version>23.12.1</spark.rapids.version>
+        <cudf.classifier>cuda12</cudf.classifier>
         <use.hip>OFF</use.hip>
-        <cudf.classifier>cuda11</cudf.classifier>
         <scalatest.version>3.2.17</scalatest.version>
         <scala-collection-compat.version>2.11.0</scala-collection-compat.version>
 
@@ -91,14 +91,6 @@
             </modules>
         </profile>
 
-        <profile>
-            <id>scala-2.13</id>
-            <properties>
-                <scala.binary.version>2.13</scala.binary.version>
-                <scala.version>2.13.11</scala.version>
-            </properties>
-        </profile>
-
         <!-- gpu profile with both cpu and gpu test suites -->
         <profile>
             <id>gpu</id>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index 3a56615d6..431c6766a 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -5,11 +5,11 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
     <name>xgboost4j-example</name>
-    <artifactId>xgboost4j-example_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-example_2.12</artifactId>
     <version>2.1.0-SNAPSHOT</version>
     <packaging>jar</packaging>
     <build>
@@ -26,7 +26,7 @@
     <dependencies>
         <dependency>
             <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-spark_2.12</artifactId>
             <version>${project.version}</version>
         </dependency>
         <dependency>
@@ -37,7 +37,7 @@
         </dependency>
         <dependency>
             <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-flink_2.12</artifactId>
             <version>${project.version}</version>
         </dependency>
     </dependencies>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index 6f700ca0a..e3dfb3830 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -5,12 +5,12 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
 
     <name>xgboost4j-flink</name>
-    <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-flink_2.12</artifactId>
     <version>2.1.0-SNAPSHOT</version>
     <properties>
       <flink-ml.version>2.2.0</flink-ml.version>
@@ -30,7 +30,7 @@
     <dependencies>
         <dependency>
             <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j_2.12</artifactId>
             <version>${project.version}</version>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index bcc3fe3bb..2dc36d52d 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -5,10 +5,10 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
-    <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-gpu_2.12</artifactId>
     <name>xgboost4j-gpu</name>
     <version>2.1.0-SNAPSHOT</version>
     <packaging>jar</packaging>
diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml
index a29b4e056..149f2f3a3 100644
--- a/jvm-packages/xgboost4j-spark-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml
@@ -5,11 +5,11 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
     <name>xgboost4j-spark-gpu</name>
-    <artifactId>xgboost4j-spark-gpu_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
     <build>
         <plugins>
             <plugin>
@@ -24,7 +24,7 @@
     <dependencies>
         <dependency>
             <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j-gpu_2.12</artifactId>
             <version>${project.version}</version>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
index 179b1c762..6f16335f0 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -5,11 +5,11 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
     <name>xgboost4j-spark</name>
-    <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
     <build>
         <plugins>
             <plugin>
@@ -24,7 +24,7 @@
     <dependencies>
         <dependency>
             <groupId>ml.dmlc</groupId>
-            <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+            <artifactId>xgboost4j_2.12</artifactId>
             <version>${project.version}</version>
         </dependency>
         <dependency>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index e05bbcf48..7eb186919 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -5,11 +5,11 @@
     <modelVersion>4.0.0</modelVersion>
     <parent>
         <groupId>ml.dmlc</groupId>
-        <artifactId>xgboost-jvm</artifactId>
+        <artifactId>xgboost-jvm_2.12</artifactId>
         <version>2.1.0-SNAPSHOT</version>
     </parent>
     <name>xgboost4j</name>
-    <artifactId>xgboost4j_${scala.binary.version}</artifactId>
+    <artifactId>xgboost4j_2.12</artifactId>
     <version>2.1.0-SNAPSHOT</version>
     <packaging>jar</packaging>
 
diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py
index 6b4ae5b07..a9b51f35d 100644
--- a/python-package/xgboost/dask/__init__.py
+++ b/python-package/xgboost/dask/__init__.py
@@ -61,7 +61,7 @@ from typing import (
 import numpy
 
 from xgboost import collective, config
-from xgboost._typing import _T, FeatureNames, FeatureTypes, ModelIn
+from xgboost._typing import _T, FeatureNames, FeatureTypes
 from xgboost.callback import TrainingCallback
 from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
 from xgboost.core import (
@@ -1774,14 +1774,11 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         sample_weight: Optional[_DaskCollection],
         base_margin: Optional[_DaskCollection],
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]],
-        eval_metric: Optional[Union[str, Sequence[str], Metric]],
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]],
         base_margin_eval_set: Optional[Sequence[_DaskCollection]],
-        early_stopping_rounds: Optional[int],
         verbose: Union[int, bool],
         xgb_model: Optional[Union[Booster, XGBModel]],
         feature_weights: Optional[_DaskCollection],
-        callbacks: Optional[Sequence[TrainingCallback]],
     ) -> _DaskCollection:
         params = self.get_xgb_params()
         dtrain, evals = await _async_wrap_evaluation_matrices(
@@ -1809,9 +1806,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
             obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
-        model, metric, params, early_stopping_rounds, callbacks = self._configure_fit(
-            xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-        )
+        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
@@ -1826,8 +1821,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
             feval=None,
             custom_metric=metric,
             verbose_eval=verbose,
-            early_stopping_rounds=early_stopping_rounds,
-            callbacks=callbacks,
+            early_stopping_rounds=self.early_stopping_rounds,
+            callbacks=self.callbacks,
             xgb_model=model,
         )
         self._Booster = results["booster"]
@@ -1844,14 +1839,11 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Union[int, bool] = True,
         xgb_model: Optional[Union[Booster, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "DaskXGBRegressor":
         _assert_dask_support()
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
@@ -1871,14 +1863,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         sample_weight: Optional[_DaskCollection],
         base_margin: Optional[_DaskCollection],
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]],
-        eval_metric: Optional[Union[str, Sequence[str], Metric]],
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]],
         base_margin_eval_set: Optional[Sequence[_DaskCollection]],
-        early_stopping_rounds: Optional[int],
         verbose: Union[int, bool],
         xgb_model: Optional[Union[Booster, XGBModel]],
         feature_weights: Optional[_DaskCollection],
-        callbacks: Optional[Sequence[TrainingCallback]],
     ) -> "DaskXGBClassifier":
         params = self.get_xgb_params()
         dtrain, evals = await _async_wrap_evaluation_matrices(
@@ -1924,9 +1913,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
-        model, metric, params, early_stopping_rounds, callbacks = self._configure_fit(
-            xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-        )
+        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
@@ -1941,8 +1928,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             feval=None,
             custom_metric=metric,
             verbose_eval=verbose,
-            early_stopping_rounds=early_stopping_rounds,
-            callbacks=callbacks,
+            early_stopping_rounds=self.early_stopping_rounds,
+            callbacks=self.callbacks,
             xgb_model=model,
         )
         self._Booster = results["booster"]
@@ -1960,14 +1947,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Union[int, bool] = True,
         xgb_model: Optional[Union[Booster, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "DaskXGBClassifier":
         _assert_dask_support()
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
@@ -2063,7 +2047,7 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
     def __init__(self, *, objective: str = "rank:pairwise", **kwargs: Any):
         if callable(objective):
             raise ValueError("Custom objective function not supported by XGBRanker.")
-        super().__init__(objective=objective, kwargs=kwargs)
+        super().__init__(objective=objective, **kwargs)
 
     async def _fit_async(
         self,
@@ -2078,12 +2062,9 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
         base_margin_eval_set: Optional[Sequence[_DaskCollection]],
         eval_group: Optional[Sequence[_DaskCollection]],
         eval_qid: Optional[Sequence[_DaskCollection]],
-        eval_metric: Optional[Union[str, Sequence[str], Metric]],
-        early_stopping_rounds: Optional[int],
         verbose: Union[int, bool],
         xgb_model: Optional[Union[XGBModel, Booster]],
         feature_weights: Optional[_DaskCollection],
-        callbacks: Optional[Sequence[TrainingCallback]],
     ) -> "DaskXGBRanker":
         msg = "Use `qid` instead of `group` on dask interface."
         if not (group is None and eval_group is None):
@@ -2111,14 +2092,7 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
-        if eval_metric is not None:
-            if callable(eval_metric):
-                raise ValueError(
-                    "Custom evaluation metric is not yet supported for XGBRanker."
-                )
-        model, metric, params, early_stopping_rounds, callbacks = self._configure_fit(
-            xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-        )
+        model, metric, params = self._configure_fit(xgb_model, params)
         results = await self.client.sync(
             _train_async,
             asynchronous=True,
@@ -2133,8 +2107,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
             feval=None,
             custom_metric=metric,
             verbose_eval=verbose,
-            early_stopping_rounds=early_stopping_rounds,
-            callbacks=callbacks,
+            early_stopping_rounds=self.early_stopping_rounds,
+            callbacks=self.callbacks,
             xgb_model=model,
         )
         self._Booster = results["booster"]
@@ -2155,14 +2129,11 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
         eval_group: Optional[Sequence[_DaskCollection]] = None,
         eval_qid: Optional[Sequence[_DaskCollection]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Union[int, bool] = False,
         xgb_model: Optional[Union[XGBModel, Booster]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "DaskXGBRanker":
         _assert_dask_support()
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
@@ -2221,18 +2192,15 @@ class DaskXGBRFRegressor(DaskXGBRegressor):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Union[int, bool] = True,
         xgb_model: Optional[Union[Booster, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "DaskXGBRFRegressor":
         _assert_dask_support()
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
-        _check_rf_callback(early_stopping_rounds, callbacks)
+        _check_rf_callback(self.early_stopping_rounds, self.callbacks)
         super().fit(**args)
         return self
 
@@ -2285,17 +2253,14 @@ class DaskXGBRFClassifier(DaskXGBClassifier):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Callable]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Union[int, bool] = True,
         xgb_model: Optional[Union[Booster, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "DaskXGBRFClassifier":
         _assert_dask_support()
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
-        _check_rf_callback(early_stopping_rounds, callbacks)
+        _check_rf_callback(self.early_stopping_rounds, self.callbacks)
         super().fit(**args)
         return self
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 8c3a96784..a0fde2292 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -349,12 +349,6 @@ __model_doc = f"""
         See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
         information.
 
-        .. note::
-
-             This parameter replaces `eval_metric` in :py:meth:`fit` method.  The old
-             one receives un-transformed prediction regardless of whether custom
-             objective is being used.
-
         .. code-block:: python
 
             from sklearn.datasets import load_diabetes
@@ -389,10 +383,6 @@ __model_doc = f"""
           early stopping.  If there's more than one metric in **eval_metric**, the last
           metric will be used for early stopping.
 
-        .. note::
-
-            This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
-
     callbacks : Optional[List[TrainingCallback]]
         List of callback functions that are applied at end of each iteration.
         It is possible to use predefined callbacks by using
@@ -872,16 +862,11 @@ class XGBModel(XGBModelBase):
     def _configure_fit(
         self,
         booster: Optional[Union[Booster, "XGBModel", str]],
-        eval_metric: Optional[Union[Callable, str, Sequence[str]]],
         params: Dict[str, Any],
-        early_stopping_rounds: Optional[int],
-        callbacks: Optional[Sequence[TrainingCallback]],
     ) -> Tuple[
         Optional[Union[Booster, str, "XGBModel"]],
         Optional[Metric],
         Dict[str, Any],
-        Optional[int],
-        Optional[Sequence[TrainingCallback]],
     ]:
         """Configure parameters for :py:meth:`fit`."""
         if isinstance(booster, XGBModel):
@@ -903,49 +888,16 @@ class XGBModel(XGBModelBase):
                 "or `set_params` instead."
             )
 
-        # Configure evaluation metric.
-        if eval_metric is not None:
-            _deprecated("eval_metric")
-        if self.eval_metric is not None and eval_metric is not None:
-            _duplicated("eval_metric")
-        # - track where does the evaluation metric come from
-        if self.eval_metric is not None:
-            from_fit = False
-            eval_metric = self.eval_metric
-        else:
-            from_fit = True
         # - configure callable evaluation metric
         metric: Optional[Metric] = None
-        if eval_metric is not None:
-            if callable(eval_metric) and from_fit:
-                # No need to wrap the evaluation function for old parameter.
-                metric = eval_metric
-            elif callable(eval_metric):
-                # Parameter from constructor or set_params
+        if self.eval_metric is not None:
+            if callable(self.eval_metric):
                 if self._get_type() == "ranker":
-                    metric = ltr_metric_decorator(eval_metric, self.n_jobs)
+                    metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
                 else:
-                    metric = _metric_decorator(eval_metric)
+                    metric = _metric_decorator(self.eval_metric)
             else:
-                params.update({"eval_metric": eval_metric})
-
-        # Configure early_stopping_rounds
-        if early_stopping_rounds is not None:
-            _deprecated("early_stopping_rounds")
-        if early_stopping_rounds is not None and self.early_stopping_rounds is not None:
-            _duplicated("early_stopping_rounds")
-        early_stopping_rounds = (
-            self.early_stopping_rounds
-            if self.early_stopping_rounds is not None
-            else early_stopping_rounds
-        )
-
-        # Configure callbacks
-        if callbacks is not None:
-            _deprecated("callbacks")
-        if callbacks is not None and self.callbacks is not None:
-            _duplicated("callbacks")
-        callbacks = self.callbacks if self.callbacks is not None else callbacks
+                params.update({"eval_metric": self.eval_metric})
 
         tree_method = params.get("tree_method", None)
         if self.enable_categorical and tree_method == "exact":
@@ -953,7 +905,7 @@ class XGBModel(XGBModelBase):
                 "Experimental support for categorical data is not implemented for"
                 " current tree method yet."
             )
-        return model, metric, params, early_stopping_rounds, callbacks
+        return model, metric, params
 
     def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
         # Use `QuantileDMatrix` to save memory.
@@ -979,14 +931,11 @@ class XGBModel(XGBModelBase):
         sample_weight: Optional[ArrayLike] = None,
         base_margin: Optional[ArrayLike] = None,
         eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = True,
         xgb_model: Optional[Union[Booster, str, "XGBModel"]] = None,
         sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
         base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
         feature_weights: Optional[ArrayLike] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "XGBModel":
         # pylint: disable=invalid-name,attribute-defined-outside-init
         """Fit gradient boosting model.
@@ -1017,18 +966,6 @@ class XGBModel(XGBModelBase):
             metrics will be computed.
             Validation metrics will help us track the performance of the model.
 
-        eval_metric : str, list of str, or callable, optional
-
-            .. deprecated:: 1.6.0
-
-            Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
-
-        early_stopping_rounds : int
-
-            .. deprecated:: 1.6.0
-
-            Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
-            instead.
         verbose :
             If `verbose` is True and an evaluation set is used, the evaluation metric
             measured on the validation set is printed to stdout at each boosting stage.
@@ -1049,10 +986,6 @@ class XGBModel(XGBModelBase):
             selected when colsample is being used.  All values must be greater than 0,
             otherwise a `ValueError` is thrown.
 
-        callbacks :
-            .. deprecated:: 1.6.0
-                Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
-
         """
         with config_context(verbosity=self.verbosity):
             evals_result: TrainingCallback.EvalsLog = {}
@@ -1082,27 +1015,19 @@ class XGBModel(XGBModelBase):
             else:
                 obj = None
 
-            (
-                model,
-                metric,
-                params,
-                early_stopping_rounds,
-                callbacks,
-            ) = self._configure_fit(
-                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-            )
+            model, metric, params = self._configure_fit(xgb_model, params)
             self._Booster = train(
                 params,
                 train_dmatrix,
                 self.get_num_boosting_rounds(),
                 evals=evals,
-                early_stopping_rounds=early_stopping_rounds,
+                early_stopping_rounds=self.early_stopping_rounds,
                 evals_result=evals_result,
                 obj=obj,
                 custom_metric=metric,
                 verbose_eval=verbose,
                 xgb_model=model,
-                callbacks=callbacks,
+                callbacks=self.callbacks,
             )
 
             self._set_evaluation_result(evals_result)
@@ -1437,14 +1362,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         sample_weight: Optional[ArrayLike] = None,
         base_margin: Optional[ArrayLike] = None,
         eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = True,
         xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
         base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
         feature_weights: Optional[ArrayLike] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "XGBClassifier":
         # pylint: disable = attribute-defined-outside-init,too-many-statements
         with config_context(verbosity=self.verbosity):
@@ -1492,15 +1414,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                     params["objective"] = "multi:softprob"
                 params["num_class"] = self.n_classes_
 
-            (
-                model,
-                metric,
-                params,
-                early_stopping_rounds,
-                callbacks,
-            ) = self._configure_fit(
-                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-            )
+            model, metric, params = self._configure_fit(xgb_model, params)
             train_dmatrix, evals = _wrap_evaluation_matrices(
                 missing=self.missing,
                 X=X,
@@ -1525,13 +1439,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                 train_dmatrix,
                 self.get_num_boosting_rounds(),
                 evals=evals,
-                early_stopping_rounds=early_stopping_rounds,
+                early_stopping_rounds=self.early_stopping_rounds,
                 evals_result=evals_result,
                 obj=obj,
                 custom_metric=metric,
                 verbose_eval=verbose,
                 xgb_model=model,
-                callbacks=callbacks,
+                callbacks=self.callbacks,
             )
 
             if not callable(self.objective):
@@ -1693,17 +1607,14 @@ class XGBRFClassifier(XGBClassifier):
         sample_weight: Optional[ArrayLike] = None,
         base_margin: Optional[ArrayLike] = None,
         eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = True,
         xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
         base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
         feature_weights: Optional[ArrayLike] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "XGBRFClassifier":
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
-        _check_rf_callback(early_stopping_rounds, callbacks)
+        _check_rf_callback(self.early_stopping_rounds, self.callbacks)
         super().fit(**args)
         return self
 
@@ -1768,17 +1679,14 @@ class XGBRFRegressor(XGBRegressor):
         sample_weight: Optional[ArrayLike] = None,
         base_margin: Optional[ArrayLike] = None,
         eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = True,
         xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
         base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
         feature_weights: Optional[ArrayLike] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "XGBRFRegressor":
         args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
-        _check_rf_callback(early_stopping_rounds, callbacks)
+        _check_rf_callback(self.early_stopping_rounds, self.callbacks)
         super().fit(**args)
         return self
 
@@ -1883,14 +1791,11 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
         eval_group: Optional[Sequence[ArrayLike]] = None,
         eval_qid: Optional[Sequence[ArrayLike]] = None,
-        eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
-        early_stopping_rounds: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = False,
         xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
         base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
         feature_weights: Optional[ArrayLike] = None,
-        callbacks: Optional[Sequence[TrainingCallback]] = None,
     ) -> "XGBRanker":
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """Fit gradient boosting ranker
@@ -1960,15 +1865,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
             pair in **eval_set**. The special column convention in `X` applies to
             validation datasets as well.
 
-        eval_metric : str, list of str, optional
-            .. deprecated:: 1.6.0
-                use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
-
-        early_stopping_rounds : int
-            .. deprecated:: 1.6.0
-                use `early_stopping_rounds` in :py:meth:`__init__` or
-                :py:meth:`set_params` instead.
-
         verbose :
             If `verbose` is True and an evaluation set is used, the evaluation metric
             measured on the validation set is printed to stdout at each boosting stage.
@@ -1996,10 +1892,6 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
             selected when colsample is being used.  All values must be greater than 0,
             otherwise a `ValueError` is thrown.
 
-        callbacks :
-            .. deprecated:: 1.6.0
-                Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
-
         """
         with config_context(verbosity=self.verbosity):
             train_dmatrix, evals = _wrap_evaluation_matrices(
@@ -2024,27 +1916,19 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
             evals_result: TrainingCallback.EvalsLog = {}
             params = self.get_xgb_params()
 
-            (
-                model,
-                metric,
-                params,
-                early_stopping_rounds,
-                callbacks,
-            ) = self._configure_fit(
-                xgb_model, eval_metric, params, early_stopping_rounds, callbacks
-            )
+            model, metric, params = self._configure_fit(xgb_model, params)
 
             self._Booster = train(
                 params,
                 train_dmatrix,
                 num_boost_round=self.get_num_boosting_rounds(),
-                early_stopping_rounds=early_stopping_rounds,
+                early_stopping_rounds=self.early_stopping_rounds,
                 evals=evals,
                 evals_result=evals_result,
                 custom_metric=metric,
                 verbose_eval=verbose,
                 xgb_model=model,
-                callbacks=callbacks,
+                callbacks=self.callbacks,
             )
 
             self.objective = params["objective"]
diff --git a/python-package/xgboost/testing/continuation.py b/python-package/xgboost/testing/continuation.py
new file mode 100644
index 000000000..9d6dc0338
--- /dev/null
+++ b/python-package/xgboost/testing/continuation.py
@@ -0,0 +1,58 @@
+"""Tests for training continuation."""
+import json
+from typing import Any, Dict, TypeVar
+
+import numpy as np
+import pytest
+
+import xgboost as xgb
+
+
+# pylint: disable=too-many-locals
+def run_training_continuation_model_output(device: str, tree_method: str) -> None:
+    """Run training continuation test."""
+    datasets = pytest.importorskip("sklearn.datasets")
+    n_samples = 64
+    n_features = 32
+    X, y = datasets.make_regression(n_samples, n_features, random_state=1)
+
+    dtrain = xgb.DMatrix(X, y)
+    params = {
+        "tree_method": tree_method,
+        "max_depth": "2",
+        "gamma": "0.1",
+        "alpha": "0.01",
+        "device": device,
+    }
+    bst_0 = xgb.train(params, dtrain, num_boost_round=64)
+    dump_0 = bst_0.get_dump(dump_format="json")
+
+    bst_1 = xgb.train(params, dtrain, num_boost_round=32)
+    bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1)
+    dump_1 = bst_1.get_dump(dump_format="json")
+
+    T = TypeVar("T", Dict[str, Any], float, str, int, list)
+
+    def recursive_compare(obj_0: T, obj_1: T) -> None:
+        if isinstance(obj_0, float):
+            assert np.isclose(obj_0, obj_1, atol=1e-6)
+        elif isinstance(obj_0, str):
+            assert obj_0 == obj_1
+        elif isinstance(obj_0, int):
+            assert obj_0 == obj_1
+        elif isinstance(obj_0, dict):
+            for i in range(len(obj_0.items())):
+                assert list(obj_0.keys())[i] == list(obj_1.keys())[i]
+                if list(obj_0.keys())[i] != "missing":
+                    recursive_compare(list(obj_0.values()), list(obj_1.values()))
+        else:
+            for i, lhs in enumerate(obj_0):
+                rhs = obj_1[i]
+                recursive_compare(lhs, rhs)
+
+    assert len(dump_0) == len(dump_1)
+
+    for i, lhs in enumerate(dump_0):
+        obj_0 = json.loads(lhs)
+        obj_1 = json.loads(dump_1[i])
+        recursive_compare(obj_0, obj_1)
diff --git a/src/collective/coll.cc b/src/collective/coll.cc
index 3191896f8..5f14e4d9a 100644
--- a/src/collective/coll.cc
+++ b/src/collective/coll.cc
@@ -18,6 +18,8 @@
 
 #if defined(XGBOOST_USE_CUDA)
 #include "cuda_fp16.h"  // for __half
+#elif defined(XGBOOST_USE_HIP)
+#include <hip/hip_fp16.h>  // for __half
 #endif
 
 namespace xgboost::collective {
diff --git a/src/collective/loop.cc b/src/collective/loop.cc
index 5cfb0034d..b51749fcd 100644
--- a/src/collective/loop.cc
+++ b/src/collective/loop.cc
@@ -1,11 +1,19 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #include "loop.h"
 
-#include <queue>  // for queue
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t
+#include <exception>  // for exception, current_exception, rethrow_exception
+#include <mutex>      // for lock_guard, unique_lock
+#include <queue>      // for queue
+#include <string>     // for string
+#include <thread>     // for thread
+#include <utility>    // for move
 
 #include "rabit/internal/socket.h"      // for PollHelper
+#include "xgboost/collective/result.h"  // for Fail, Success
 #include "xgboost/collective/socket.h"  // for FailWithCode
 #include "xgboost/logging.h"            // for CHECK
 
@@ -109,62 +117,94 @@ Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
 }
 
 void Loop::Process() {
-  // consumer
-  while (true) {
-    std::unique_lock lock{mu_};
-    cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
-    if (stop_) {
-      break;
-    }
+  auto set_rc = [this](Result&& rc) {
+    std::lock_guard lock{rc_lock_};
+    rc_ = std::forward<Result>(rc);
+  };
+
+  // This loop cannot exit unless `stop_` is set to true. There must always be a thread to
+  // answer the blocking call even if there are errors, otherwise the blocking will wait
+  // forever.
+  while (true) {
+    try {
+      std::unique_lock lock{mu_};
+      cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; });
+      if (stop_) {
+        break;  // only point where this loop can exit.
+      }
+
+      // Move the global queue into a local variable to unblock it.
+      std::queue<Op> qcopy;
+
+      bool is_blocking = false;
+      while (!queue_.empty()) {
+        auto op = queue_.front();
+        queue_.pop();
+        if (op.code == Op::kBlock) {
+          is_blocking = true;
+          // Block must be the last op in the current batch since no further submit can be
+          // issued until the blocking call is finished.
+          CHECK(queue_.empty());
+        } else {
+          qcopy.push(op);
+        }
+      }
 
-    auto unlock_notify = [&](bool is_blocking, bool stop) {
       if (!is_blocking) {
-        std::lock_guard guard{mu_};
-        stop_ = stop;
-      } else {
-        stop_ = stop;
+        // Unblock, we can write to the global queue again.
         lock.unlock();
       }
-      cv_.notify_one();
-    };
 
-    // move the queue
-    std::queue<Op> qcopy;
-    bool is_blocking = false;
-    while (!queue_.empty()) {
-      auto op = queue_.front();
-      queue_.pop();
-      if (op.code == Op::kBlock) {
-        is_blocking = true;
-      } else {
-        qcopy.push(op);
+      // Clear the local queue, this is blocking the current worker thread (but not the
+      // client thread), wait until all operations are finished.
+      auto rc = this->EmptyQueue(&qcopy);
+
+      if (is_blocking) {
+        // The unlock is delayed if this is a blocking call
+        lock.unlock();
       }
-    }
-    // unblock the queue
-    if (!is_blocking) {
-      lock.unlock();
-    }
-    // clear the queue
-    auto rc = this->EmptyQueue(&qcopy);
-    // Handle error
-    if (!rc.OK()) {
-      unlock_notify(is_blocking, true);
-      std::lock_guard<std::mutex> guard{rc_lock_};
-      this->rc_ = std::move(rc);
-      return;
-    }
 
-    CHECK(qcopy.empty());
-    unlock_notify(is_blocking, false);
+      // Notify the client thread who called block after all error conditions are set.
+      auto notify_if_block = [&] {
+        if (is_blocking) {
+          std::unique_lock lock{mu_};
+          block_done_ = true;
+          lock.unlock();
+          block_cv_.notify_one();
+        }
+      };
+
+      // Handle error
+      if (!rc.OK()) {
+        set_rc(std::move(rc));
+      } else {
+        CHECK(qcopy.empty());
+      }
+
+      notify_if_block();
+    } catch (std::exception const& e) {
+      curr_exce_ = std::current_exception();
+      set_rc(Fail("Exception inside the event loop:" + std::string{e.what()}));
+    } catch (...) {
+      curr_exce_ = std::current_exception();
+      set_rc(Fail("Unknown exception inside the event loop."));
+    }
   }
 }
 
 Result Loop::Stop() {
+  // Finish all remaining tasks
+  CHECK_EQ(this->Block().OK(), this->rc_.OK());
+
+  // Notify the loop to stop
   std::unique_lock lock{mu_};
   stop_ = true;
   lock.unlock();
+  this->cv_.notify_one();
 
-  CHECK_EQ(this->Block().OK(), this->rc_.OK());
+  if (this->worker_.joinable()) {
+    this->worker_.join();
+  }
 
   if (curr_exce_) {
     std::rethrow_exception(curr_exce_);
@@ -175,17 +215,29 @@ Result Loop::Stop() {
 
 [[nodiscard]] Result Loop::Block() {
   {
+    // Check whether the last op was successful, stop if not.
     std::lock_guard<std::mutex> guard{rc_lock_};
     if (!rc_.OK()) {
-      return std::move(rc_);
+      stop_ = true;
     }
   }
-  this->Submit(Op{Op::kBlock});
-  {
-    std::unique_lock lock{mu_};
-    cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; });
+
+  if (!this->worker_.joinable()) {
+    std::lock_guard<std::mutex> guard{rc_lock_};
+    return Fail("Worker has stopped.", std::move(rc_));
   }
+
+  this->Submit(Op{Op::kBlock});
+
   {
+    // Wait for the block call to finish.
+    std::unique_lock lock{mu_};
+    block_cv_.wait(lock, [this] { return block_done_ || stop_; });
+    block_done_ = false;
+  }
+
+  {
+    // Transfer the rc.
     std::lock_guard<std::mutex> lock{rc_lock_};
     return std::move(rc_);
   }
@@ -193,26 +245,6 @@ Result Loop::Stop() {
 
 Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
   timer_.Init(__func__);
-  worker_ = std::thread{[this] {
-    try {
-      this->Process();
-    } catch (std::exception const& e) {
-      std::lock_guard<std::mutex> guard{mu_};
-      if (!curr_exce_) {
-        curr_exce_ = std::current_exception();
-        rc_ = Fail("Exception was thrown");
-      }
-      stop_ = true;
-      cv_.notify_all();
-    } catch (...) {
-      std::lock_guard<std::mutex> guard{mu_};
-      if (!curr_exce_) {
-        curr_exce_ = std::current_exception();
-        rc_ = Fail("Exception was thrown");
-      }
-      stop_ = true;
-      cv_.notify_all();
-    }
-  }};
+  worker_ = std::thread{[this] { this->Process(); }};
 }
 }  // namespace xgboost::collective
diff --git a/src/collective/loop.h b/src/collective/loop.h
index 0c1fdcbfe..4839abfd3 100644
--- a/src/collective/loop.h
+++ b/src/collective/loop.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
  */
 #pragma once
 #include <chrono>              // for seconds
@@ -10,7 +10,6 @@
 #include <mutex>               // for unique_lock, mutex
 #include <queue>               // for queue
 #include <thread>              // for thread
-#include <utility>             // for move
 
 #include "../common/timer.h"            // for Monitor
 #include "xgboost/collective/result.h"  // for Result
@@ -37,10 +36,15 @@ class Loop {
   };
 
  private:
-  std::thread worker_;
-  std::condition_variable cv_;
-  std::mutex mu_;
-  std::queue<Op> queue_;
+  std::thread worker_;  // thread worker to execute the tasks
+
+  std::condition_variable cv_;        // CV used to notify a new submit call
+  std::condition_variable block_cv_;  // CV used to notify the blocking call
+  bool block_done_{false};            // Flag to indicate whether the blocking call has finished.
+
+  std::queue<Op> queue_;  // event queue
+  std::mutex mu_;         // mutex to protect the queue, cv, and block_done
+
   std::chrono::seconds timeout_;
 
   Result rc_;
@@ -51,29 +55,33 @@ class Loop {
   common::Monitor mutable timer_;
 
   Result EmptyQueue(std::queue<Op>* p_queue) const;
+  // The cunsumer function that runs inside a worker thread.
   void Process();
 
  public:
+  /**
+   * @brief Stop the worker thread.
+   */
   Result Stop();
 
   void Submit(Op op) {
-    // producer
     std::unique_lock lock{mu_};
     queue_.push(op);
     lock.unlock();
     cv_.notify_one();
   }
 
+  /**
+   * @brief Block the event loop until all ops are finished. In the case of failure, this
+   *        loop should be not be used for new operations.
+   */
   [[nodiscard]] Result Block();
 
   explicit Loop(std::chrono::seconds timeout);
 
   ~Loop() noexcept(false) {
+    // The worker will be joined in the stop function.
     this->Stop();
-
-    if (worker_.joinable()) {
-      worker_.join();
-    }
   }
 };
 }  // namespace xgboost::collective
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 409c1f260..62e40f4d4 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -318,7 +318,6 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
     grid_size = std::min(common::DivRoundUp(grid_size, num_groups), static_cast<std::uint32_t>(
                                         common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
 #endif
-
     dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
                      ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
                                      gpair.data(), rounding);
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index 680c50398..bc534d351 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -730,6 +730,9 @@ class HistMultiEvaluator {
 
     std::size_t n_nodes = p_tree->Size();
     gain_.resize(n_nodes);
+    // Re-calculate weight without learning rate.
+    CalcWeight(*param_, left_sum, left_weight);
+    CalcWeight(*param_, right_sum, right_weight);
     gain_[left_child] = CalcGainGivenWeight(*param_, left_sum, left_weight);
     gain_[right_child] = CalcGainGivenWeight(*param_, right_sum, right_weight);
 
diff --git a/src/tree/multi_target_tree_model.cc b/src/tree/multi_target_tree_model.cc
index bccc1967e..11ee1f6dd 100644
--- a/src/tree/multi_target_tree_model.cc
+++ b/src/tree/multi_target_tree_model.cc
@@ -195,8 +195,9 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl
   split_index_.resize(n);
   split_index_[nidx] = split_idx;
 
-  split_conds_.resize(n);
+  split_conds_.resize(n, std::numeric_limits<float>::quiet_NaN());
   split_conds_[nidx] = split_cond;
+
   default_left_.resize(n);
   default_left_[nidx] = static_cast<std::uint8_t>(default_left);
 
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 7731f505e..c2aaedafa 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -149,6 +149,9 @@ class MultiTargetHistBuilder {
   }
 
   void InitData(DMatrix *p_fmat, RegTree const *p_tree) {
+    if (collective::IsDistributed()) {
+      LOG(FATAL) << "Distributed training for vector-leaf is not yet supported.";
+    }
     monitor_->Start(__func__);
 
     p_last_fmat_ = p_fmat;
diff --git a/tests/README.md b/tests/README.md
index 4c29f4905..a118e7918 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -4,15 +4,13 @@ facilities.
 # Directories
   * ci_build:  Test facilities for Jenkins CI and GitHub action.
   * cli: Basic test for command line executable `xgboost`.  Most of the other command line
-    specific tests are in Python test `test_cli.py`
+    specific tests are in Python test `test_cli.py`.
   * cpp: Tests for C++ core, using Google test framework.
   * python: Tests for Python package, demonstrations and CLI.  For how to setup the
     dependencies for tests, see conda files in `ci_build`.
   * python-gpu: Similar to python tests, but for GPU.
   * travis: CI facilities for Travis.
-  * distributed: Test for distributed system.
-  * benchmark: Legacy benchmark code.  There are a number of benchmark projects for
-    XGBoost with much better configurations.
+  * test_distributed: Test for distributed systems including spark and dask.
 
 # Others
   * pytest.ini: Describes the `pytest` marker for python tests, some markers are generated
diff --git a/tests/benchmark/benchmark_linear.py b/tests/benchmark/benchmark_linear.py
deleted file mode 100644
index cb5141714..000000000
--- a/tests/benchmark/benchmark_linear.py
+++ /dev/null
@@ -1,69 +0,0 @@
-#pylint: skip-file
-import argparse
-import xgboost as xgb
-import numpy as np
-from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
-import time
-import ast
-
-rng = np.random.RandomState(1994)
-
-
-def run_benchmark(args):
-
-    try:
-        dtest = xgb.DMatrix('dtest.dm')
-        dtrain = xgb.DMatrix('dtrain.dm')
-
-        if not (dtest.num_col() == args.columns \
-                and dtrain.num_col() == args.columns):
-            raise ValueError("Wrong cols")
-        if not (dtest.num_row() == args.rows * args.test_size \
-                and dtrain.num_row() == args.rows * (1-args.test_size)):
-            raise ValueError("Wrong rows")
-    except:
-
-        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
-        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
-        tmp = time.time()
-        X, y = make_classification(args.rows, n_features=args.columns, n_redundant=0, n_informative=args.columns, n_repeated=0, random_state=7)
-        if args.sparsity < 1.0:
-           X = np.array([[np.nan if rng.uniform(0, 1) < args.sparsity else x for x in x_row] for x_row in X])
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size, random_state=7)
-        print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
-        tmp = time.time()
-        print ("DMatrix Start")
-        dtrain = xgb.DMatrix(X_train, y_train)
-        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
-        print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
-
-        dtest.save_binary('dtest.dm')
-        dtrain.save_binary('dtrain.dm')
-
-    param = {'objective': 'binary:logistic','booster':'gblinear'}
-    if args.params != '':
-        param.update(ast.literal_eval(args.params))
-
-    param['updater'] = args.updater
-    print("Training with '%s'" % param['updater'])
-    tmp = time.time()
-    xgb.train(param, dtrain, args.iterations, evals=[(dtrain,"train")], early_stopping_rounds = args.columns)
-    print ("Train Time: %s seconds" % (str(time.time() - tmp)))
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--updater', default='coord_descent')
-parser.add_argument('--sparsity', type=float, default=0.0)
-parser.add_argument('--lambda', type=float, default=1.0)
-parser.add_argument('--tol', type=float, default=1e-5)
-parser.add_argument('--alpha', type=float, default=1.0)
-parser.add_argument('--rows', type=int, default=1000000)
-parser.add_argument('--iterations', type=int, default=10000)
-parser.add_argument('--columns', type=int, default=50)
-parser.add_argument('--test_size', type=float, default=0.25)
-parser.add_argument('--standardise', type=bool, default=False)
-parser.add_argument('--params', default='', help='Provide additional parameters as a Python dict string, e.g. --params \"{\'max_depth\':2}\"')
-args = parser.parse_args()
-
-run_benchmark(args)
diff --git a/tests/benchmark/benchmark_tree.py b/tests/benchmark/benchmark_tree.py
deleted file mode 100644
index 380e03463..000000000
--- a/tests/benchmark/benchmark_tree.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""Run benchmark on the tree booster."""
-
-import argparse
-import ast
-import time
-
-import numpy as np
-import xgboost as xgb
-
-RNG = np.random.RandomState(1994)
-
-
-def run_benchmark(args):
-    """Runs the benchmark."""
-    try:
-        dtest = xgb.DMatrix('dtest.dm')
-        dtrain = xgb.DMatrix('dtrain.dm')
-
-        if not (dtest.num_col() == args.columns
-                and dtrain.num_col() == args.columns):
-            raise ValueError("Wrong cols")
-        if not (dtest.num_row() == args.rows * args.test_size
-                and dtrain.num_row() == args.rows * (1 - args.test_size)):
-            raise ValueError("Wrong rows")
-    except:
-        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
-        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
-        tmp = time.time()
-        X = RNG.rand(args.rows, args.columns)
-        y = RNG.randint(0, 2, args.rows)
-        if 0.0 < args.sparsity < 1.0:
-            X = np.array([[np.nan if RNG.uniform(0, 1) < args.sparsity else x for x in x_row]
-                          for x_row in X])
-
-        train_rows = int(args.rows * (1.0 - args.test_size))
-        test_rows = int(args.rows * args.test_size)
-        X_train = X[:train_rows, :]
-        X_test = X[-test_rows:, :]
-        y_train = y[:train_rows]
-        y_test = y[-test_rows:]
-        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
-        del X, y
-
-        tmp = time.time()
-        print("DMatrix Start")
-        dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
-        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
-        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
-        del X_train, y_train, X_test, y_test
-
-        dtest.save_binary('dtest.dm')
-        dtrain.save_binary('dtrain.dm')
-
-    param = {'objective': 'binary:logistic'}
-    if args.params != '':
-        param.update(ast.literal_eval(args.params))
-
-    param['tree_method'] = args.tree_method
-    print("Training with '%s'" % param['tree_method'])
-    tmp = time.time()
-    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
-    print("Train Time: %s seconds" % (str(time.time() - tmp)))
-
-
-def main():
-    """The main function.
-
-    Defines and parses command line arguments and calls the benchmark.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--tree_method', default='gpu_hist')
-    parser.add_argument('--sparsity', type=float, default=0.0)
-    parser.add_argument('--rows', type=int, default=1000000)
-    parser.add_argument('--columns', type=int, default=50)
-    parser.add_argument('--iterations', type=int, default=500)
-    parser.add_argument('--test_size', type=float, default=0.25)
-    parser.add_argument('--params', default='',
-                        help='Provide additional parameters as a Python dict string, e.g. --params '
-                             '\"{\'max_depth\':2}\"')
-    args = parser.parse_args()
-
-    run_benchmark(args)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/benchmark/generate_libsvm.py b/tests/benchmark/generate_libsvm.py
deleted file mode 100644
index be152df39..000000000
--- a/tests/benchmark/generate_libsvm.py
+++ /dev/null
@@ -1,87 +0,0 @@
-"""Generate synthetic data in LIBSVM format."""
-
-import argparse
-import io
-import time
-
-import numpy as np
-from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
-
-RNG = np.random.RandomState(2019)
-
-
-def generate_data(args):
-    """Generates the data."""
-    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
-    print("Sparsity {}".format(args.sparsity))
-    print("{}/{} train/test split".format(1.0 - args.test_size, args.test_size))
-
-    tmp = time.time()
-    n_informative = args.columns * 7 // 10
-    n_redundant = args.columns // 10
-    n_repeated = args.columns // 10
-    print("n_informative: {}, n_redundant: {}, n_repeated: {}".format(n_informative, n_redundant,
-                                                                      n_repeated))
-    x, y = make_classification(n_samples=args.rows, n_features=args.columns,
-                               n_informative=n_informative, n_redundant=n_redundant,
-                               n_repeated=n_repeated, shuffle=False, random_state=RNG)
-    print("Generate Time: {} seconds".format(time.time() - tmp))
-
-    tmp = time.time()
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=args.test_size,
-                                                        random_state=RNG, shuffle=False)
-    print("Train/Test Split Time: {} seconds".format(time.time() - tmp))
-
-    tmp = time.time()
-    write_file('train.libsvm', x_train, y_train, args.sparsity)
-    print("Write Train Time: {} seconds".format(time.time() - tmp))
-
-    tmp = time.time()
-    write_file('test.libsvm', x_test, y_test, args.sparsity)
-    print("Write Test Time: {} seconds".format(time.time() - tmp))
-
-
-def write_file(filename, x_data, y_data, sparsity):
-    with open(filename, 'w') as f:
-        for x, y in zip(x_data, y_data):
-            write_line(f, x, y, sparsity)
-
-
-def write_line(f, x, y, sparsity):
-    with io.StringIO() as line:
-        line.write(str(y))
-        for i, col in enumerate(x):
-            if 0.0 < sparsity < 1.0:
-                if RNG.uniform(0, 1) > sparsity:
-                    write_feature(line, i, col)
-            else:
-                write_feature(line, i, col)
-        line.write('\n')
-        f.write(line.getvalue())
-
-
-def write_feature(line, index, feature):
-    line.write(' ')
-    line.write(str(index))
-    line.write(':')
-    line.write(str(feature))
-
-
-def main():
-    """The main function.
-
-    Defines and parses command line arguments and calls the generator.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--rows', type=int, default=1000000)
-    parser.add_argument('--columns', type=int, default=50)
-    parser.add_argument('--sparsity', type=float, default=0.0)
-    parser.add_argument('--test_size', type=float, default=0.01)
-    args = parser.parse_args()
-
-    generate_data(args)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh
index 12393c561..1998385c5 100755
--- a/tests/buildkite/build-jvm-packages.sh
+++ b/tests/buildkite/build-jvm-packages.sh
@@ -8,13 +8,18 @@ echo "--- Build XGBoost JVM packages scala 2.12"
 tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION}
 
+echo "--- Stash XGBoost4J JARs (Scala 2.12)"
+buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
+buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar"
 
 echo "--- Build XGBoost JVM packages scala 2.13"
 
 tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION} "" "" "true"
 
-echo "--- Stash XGBoost4J JARs"
+echo "--- Stash XGBoost4J JARs (Scala 2.13)"
 buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar"
 buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar"
diff --git a/tests/buildkite/build-rpkg-win64-gpu.ps1 b/tests/buildkite/build-rpkg-win64-gpu.ps1
deleted file mode 100644
index a6947c270..000000000
--- a/tests/buildkite/build-rpkg-win64-gpu.ps1
+++ /dev/null
@@ -1,21 +0,0 @@
-$ErrorActionPreference = "Stop"
-
-. tests/buildkite/conftest.ps1
-
-Write-Host "--- Build XGBoost R package with CUDA"
-
-nvcc --version
-$arch_flag = "-DGPU_COMPUTE_VER=75"
-
-bash tests/ci_build/build_r_pkg_with_cuda_win64.sh $Env:BUILDKITE_COMMIT
-if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-
-if ( $is_release_branch -eq 1 ) {
-  Write-Host "--- Upload R tarball"
-  Get-ChildItem . -Filter xgboost_r_gpu_win64_*.tar.gz |
-  Foreach-Object {
-    & aws s3 cp $_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ `
-    --acl public-read --no-progress
-    if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-  }
-}
diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml
index d4491148e..83a61981e 100644
--- a/tests/buildkite/pipeline-win64.yml
+++ b/tests/buildkite/pipeline-win64.yml
@@ -13,11 +13,6 @@ steps:
     key: build-win64-gpu
     agents:
       queue: windows-cpu
-  - label: ":windows: Build XGBoost R package for Windows with CUDA"
-    command: "tests/buildkite/build-rpkg-win64-gpu.ps1"
-    key: build-rpkg-win64-gpu
-    agents:
-      queue: windows-cpu
 
   - wait
 
diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh
index 5797a1f61..84b41f2b1 100755
--- a/tests/ci_build/build_jvm_packages.sh
+++ b/tests/ci_build/build_jvm_packages.sh
@@ -24,12 +24,13 @@ if [ "x$gpu_arch" != "x" ]; then
   export GPU_ARCH_FLAG=$gpu_arch
 fi
 
-mvn_profile_string=""
 if [ "x$use_scala213" != "x" ]; then
-  export mvn_profile_string="-Pdefault,scala-2.13"
+  cd ..
+  python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+  cd jvm-packages
 fi
 
-mvn --no-transfer-progress package $mvn_profile_string -Dspark.version=${spark_version} $gpu_options
+mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
 
 set +x
 set +e
diff --git a/tests/ci_build/build_r_pkg_with_cuda_win64.sh b/tests/ci_build/build_r_pkg_with_cuda_win64.sh
deleted file mode 100644
index 580358883..000000000
--- a/tests/ci_build/build_r_pkg_with_cuda_win64.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-set -e
-set -x
-
-if [ "$#" -ne 1 ]
-then
-  echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]"
-  exit 1
-fi
-
-commit_hash="$1"
-# Clear all positional args
-set --
-
-source activate
-python tests/ci_build/test_r_package.py --task=pack
-mv xgboost/ xgboost_rpack/
-
-mkdir build
-cd build
-cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-4.3.2" -DCMAKE_PREFIX_PATH="C:\\rtools43\\x86_64-w64-mingw32.static.posix\\bin"
-cmake --build . --config Release --parallel
-cd ..
-
-# This super wacky hack is found in cmake/RPackageInstall.cmake.in and
-# cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
-# and have R use xgboost.dll that we've already built.
-rm -v xgboost_rpack/configure
-rm -rfv xgboost_rpack/src
-mkdir -p xgboost_rpack/src
-cp -v lib/xgboost.dll xgboost_rpack/src/
-echo 'all:' > xgboost_rpack/src/Makefile
-echo 'all:' > xgboost_rpack/src/Makefile.win
-mv xgboost_rpack/ xgboost/
-/c/Rtools43/usr/bin/tar -cvf xgboost_r_gpu_win64_${commit_hash}.tar xgboost/
-/c/Rtools43/usr/bin/gzip -9c xgboost_r_gpu_win64_${commit_hash}.tar > xgboost_r_gpu_win64_${commit_hash}.tar.gz
diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh
index 5f448ee2a..9531d79a9 100755
--- a/tests/ci_build/deploy_jvm_packages.sh
+++ b/tests/ci_build/deploy_jvm_packages.sh
@@ -27,7 +27,10 @@ rm -rf ../build/
 # Deploy to S3 bucket xgboost-maven-repo
 mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
 # Deploy scala 2.13 to S3 bucket xgboost-maven-repo
-mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests
+cd ..
+python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+cd jvm-packages/
+mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
 
 
 set +x
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 87d76607f..91b748b4c 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -18,14 +18,17 @@ class LintersPaths:
         "python-package/",
         # tests
         "tests/python/test_config.py",
+        "tests/python/test_callback.py",
         "tests/python/test_data_iterator.py",
         "tests/python/test_dmatrix.py",
         "tests/python/test_dt.py",
         "tests/python/test_demos.py",
+        "tests/python/test_eval_metrics.py",
         "tests/python/test_multi_target.py",
         "tests/python/test_predict.py",
         "tests/python/test_quantile_dmatrix.py",
         "tests/python/test_tree_regularization.py",
+        "tests/python/test_training_continuation.py",
         "tests/python/test_shap.py",
         "tests/python/test_model_io.py",
         "tests/python/test_with_pandas.py",
@@ -39,12 +42,15 @@ class LintersPaths:
         "demo/dask/",
         "demo/rmm_plugin",
         "demo/json-model/json_parser.py",
+        "demo/guide-python/continuation.py",
         "demo/guide-python/cat_in_the_dat.py",
         "demo/guide-python/callbacks.py",
         "demo/guide-python/categorical.py",
         "demo/guide-python/cat_pipeline.py",
         "demo/guide-python/feature_weights.py",
         "demo/guide-python/sklearn_parallel.py",
+        "demo/guide-python/sklearn_examples.py",
+        "demo/guide-python/sklearn_evals_result.py",
         "demo/guide-python/spark_estimator_examples.py",
         "demo/guide-python/external_memory.py",
         "demo/guide-python/individual_trees.py",
@@ -86,6 +92,7 @@ class LintersPaths:
         "tests/python/test_multi_target.py",
         "tests/python-gpu/test_gpu_data_iterator.py",
         "tests/python-gpu/load_pickle.py",
+        "tests/python-gpu/test_gpu_training_continuation.py",
         "tests/python/test_model_io.py",
         "tests/test_distributed/test_with_spark/test_data.py",
         "tests/test_distributed/test_gpu_with_spark/test_data.py",
@@ -93,6 +100,7 @@ class LintersPaths:
         # demo
         "demo/json-model/json_parser.py",
         "demo/guide-python/external_memory.py",
+        "demo/guide-python/continuation.py",
         "demo/guide-python/callbacks.py",
         "demo/guide-python/cat_in_the_dat.py",
         "demo/guide-python/categorical.py",
diff --git a/tests/ci_build/test_jvm_cross.sh b/tests/ci_build/test_jvm_cross.sh
index 18265cf01..4e049fce1 100755
--- a/tests/ci_build/test_jvm_cross.sh
+++ b/tests/ci_build/test_jvm_cross.sh
@@ -20,10 +20,11 @@ if [ ! -z "$RUN_INTEGRATION_TEST" ]; then
   cd $jvm_packages_dir
 fi
 
-# including maven profiles for different scala versions: 2.12 is the default at the moment.
-for _maven_profile_string in "" "-Pdefault,scala-2.13"; do
-  scala_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.version -q -DforceStdout)
-  scala_binary_version=$(mvn help:evaluate $_maven_profile_string -Dexpression=scala.binary.version -q -DforceStdout)
+for scala_binary_version in "2.12" "2.13"; do
+  cd ..
+  python dev/change_scala_version.py --scala-version ${scala_binary_version}
+  cd jvm-packages
+  scala_version=$(mvn help:evaluate -Dexpression=scala.version -q -DforceStdout)
 
   # Install XGBoost4J JAR into local Maven repository
   mvn --no-transfer-progress install:install-file -Dfile=./xgboost4j/target/xgboost4j_${scala_binary_version}-${xgboost4j_version}.jar -DgroupId=ml.dmlc -DartifactId=xgboost4j_${scala_binary_version} -Dversion=${xgboost4j_version} -Dpackaging=jar
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 6327703ed..cf806536a 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -253,6 +253,5 @@ void TestColumnSplit(bst_target_t n_targets) {
 
 TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
 
-TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
-
+TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); }
 }  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc
index 7f3e3bc94..dab0f3f4a 100644
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -1,18 +1,21 @@
 /**
- * Copyright 2020-2023 by XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/context.h>  // for Context
-#include <xgboost/task.h>     // for ObjInfo
-#include <xgboost/tree_model.h>
-#include <xgboost/tree_updater.h>
+#include <xgboost/context.h>       // for Context
+#include <xgboost/task.h>          // for ObjInfo
+#include <xgboost/tree_model.h>    // for RegTree
+#include <xgboost/tree_updater.h>  // for TreeUpdater
 
-#include <memory>                     // for unique_ptr
+#include <memory>  // for unique_ptr
 
 #include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
 namespace xgboost {
+/**
+ * @brief Test the tree statistic (like sum Hessian) is correct.
+ */
 class UpdaterTreeStatTest : public ::testing::Test {
  protected:
   std::shared_ptr<DMatrix> p_dmat_;
@@ -28,13 +31,12 @@ class UpdaterTreeStatTest : public ::testing::Test {
     gpairs_.Data()->Copy(g);
   }
 
-  void RunTest(std::string updater) {
+  void RunTest(Context const* ctx, std::string updater) {
     tree::TrainParam param;
     ObjInfo task{ObjInfo::kRegression};
     param.Init(Args{});
 
-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
-    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
+    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, ctx, &task)};
     up->Configure(Args{});
     RegTree tree{1u, kCols};
     std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -51,77 +53,136 @@ class UpdaterTreeStatTest : public ::testing::Test {
 };
 
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(UpdaterTreeStatTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_F(UpdaterTreeStatTest, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_hist");
+}
 
-TEST_F(UpdaterTreeStatTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
+TEST_F(UpdaterTreeStatTest, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_approx");
+}
+#endif  // defined(XGBOOST_USE_CUDA)
 
-TEST_F(UpdaterTreeStatTest, Exact) { this->RunTest("grow_colmaker"); }
+TEST_F(UpdaterTreeStatTest, Hist) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_quantile_histmaker");
+}
 
-TEST_F(UpdaterTreeStatTest, Approx) { this->RunTest("grow_histmaker"); }
+TEST_F(UpdaterTreeStatTest, Exact) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_colmaker");
+}
 
-class UpdaterEtaTest : public ::testing::Test {
+TEST_F(UpdaterTreeStatTest, Approx) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_histmaker");
+}
+
+/**
+ * @brief Test changing learning rate doesn't change internal splits.
+ */
+class TestSplitWithEta : public ::testing::Test {
  protected:
-  std::shared_ptr<DMatrix> p_dmat_;
-  linalg::Matrix<GradientPair> gpairs_;
-  size_t constexpr static kRows = 10;
-  size_t constexpr static kCols = 10;
-  size_t constexpr static kClasses = 10;
+  void Run(Context const* ctx, bst_target_t n_targets, std::string name) {
+    auto Xy = RandomDataGenerator{512, 64, 0.2}.Targets(n_targets).GenerateDMatrix(true);
 
-  void SetUp() override {
-    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true, false, kClasses);
-    auto g = GenerateRandomGradients(kRows);
-    gpairs_.Reshape(kRows, 1);
-    gpairs_.Data()->Copy(g);
-  }
+    auto gen_tree = [&](float eta) {
+      auto tree =
+          std::make_unique<RegTree>(n_targets, static_cast<bst_feature_t>(Xy->Info().num_col_));
+      std::vector<RegTree*> trees{tree.get()};
+      ObjInfo task{ObjInfo::kRegression};
+      std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(name, ctx, &task)};
+      updater->Configure({});
 
-  void RunTest(std::string updater) {
-    ObjInfo task{ObjInfo::kClassification};
+      auto grad = GenerateRandomGradients(ctx, Xy->Info().num_row_, n_targets);
+      CHECK_EQ(grad.Shape(1), n_targets);
+      tree::TrainParam param;
+      param.Init(Args{{"learning_rate", std::to_string(eta)}});
+      HostDeviceVector<bst_node_t> position;
 
-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
-
-    float eta = 0.4;
-    auto up_0 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
-    up_0->Configure(Args{});
-    tree::TrainParam param0;
-    param0.Init(Args{{"eta", std::to_string(eta)}});
-
-    auto up_1 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
-    up_1->Configure(Args{{"eta", "1.0"}});
-    tree::TrainParam param1;
-    param1.Init(Args{{"eta", "1.0"}});
-
-    for (size_t iter = 0; iter < 4; ++iter) {
-      RegTree tree_0{1u, kCols};
-      {
-        std::vector<HostDeviceVector<bst_node_t>> position(1);
-        up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
+      updater->Update(&param, &grad, Xy.get(), common::Span{&position, 1}, trees);
+      CHECK_EQ(tree->NumTargets(), n_targets);
+      if (n_targets > 1) {
+        CHECK(tree->IsMultiTarget());
       }
+      return tree;
+    };
 
-      RegTree tree_1{1u, kCols};
-      {
-        std::vector<HostDeviceVector<bst_node_t>> position(1);
-        up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
-      }
-      tree_0.WalkTree([&](bst_node_t nidx) {
-        if (tree_0[nidx].IsLeaf()) {
-          EXPECT_NEAR(tree_1[nidx].LeafValue() * eta, tree_0[nidx].LeafValue(), kRtEps);
+    auto eta_ratio = 8.0f;
+    auto p_tree0 = gen_tree(0.1f);
+    auto p_tree1 = gen_tree(0.1f * eta_ratio);
+    // Just to make sure we are not testing a stump.
+    CHECK_GE(p_tree0->NumExtraNodes(), 32);
+
+    bst_node_t n_nodes{0};
+    p_tree0->WalkTree([&](bst_node_t nidx) {
+      if (p_tree0->IsLeaf(nidx)) {
+        CHECK(p_tree1->IsLeaf(nidx));
+        if (p_tree0->IsMultiTarget()) {
+          CHECK(p_tree1->IsMultiTarget());
+          auto leaf_0 = p_tree0->GetMultiTargetTree()->LeafValue(nidx);
+          auto leaf_1 = p_tree1->GetMultiTargetTree()->LeafValue(nidx);
+          CHECK_EQ(leaf_0.Size(), leaf_1.Size());
+          for (std::size_t i = 0; i < leaf_0.Size(); ++i) {
+            CHECK_EQ(leaf_0(i) * eta_ratio, leaf_1(i));
+          }
+          CHECK(std::isnan(p_tree0->SplitCond(nidx)));
+          CHECK(std::isnan(p_tree1->SplitCond(nidx)));
+        } else {
+          // NON-mt tree reuses split cond for leaf value.
+          auto leaf_0 = p_tree0->SplitCond(nidx);
+          auto leaf_1 = p_tree1->SplitCond(nidx);
+          CHECK_EQ(leaf_0 * eta_ratio, leaf_1);
         }
-        return true;
-      });
-    }
+      } else {
+        CHECK(!p_tree1->IsLeaf(nidx));
+        CHECK_EQ(p_tree0->SplitCond(nidx), p_tree1->SplitCond(nidx));
+      }
+      n_nodes++;
+      return true;
+    });
+    ASSERT_EQ(n_nodes, p_tree0->NumExtraNodes() + 1);
   }
 };
 
-TEST_F(UpdaterEtaTest, Hist) { this->RunTest("grow_quantile_histmaker"); }
+TEST_F(TestSplitWithEta, HistMulti) {
+  Context ctx;
+  bst_target_t n_targets{3};
+  this->Run(&ctx, n_targets, "grow_quantile_histmaker");
+}
 
-TEST_F(UpdaterEtaTest, Exact) { this->RunTest("grow_colmaker"); }
+TEST_F(TestSplitWithEta, Hist) {
+  Context ctx;
+  bst_target_t n_targets{1};
+  this->Run(&ctx, n_targets, "grow_quantile_histmaker");
+}
 
-TEST_F(UpdaterEtaTest, Approx) { this->RunTest("grow_histmaker"); }
+TEST_F(TestSplitWithEta, Approx) {
+  Context ctx;
+  bst_target_t n_targets{1};
+  this->Run(&ctx, n_targets, "grow_histmaker");
+}
+
+TEST_F(TestSplitWithEta, Exact) {
+  Context ctx;
+  bst_target_t n_targets{1};
+  this->Run(&ctx, n_targets, "grow_colmaker");
+}
 
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_F(TestSplitWithEta, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  bst_target_t n_targets{1};
+  this->Run(&ctx, n_targets, "grow_gpu_hist");
+}
+
+TEST_F(TestSplitWithEta, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  bst_target_t n_targets{1};
+  this->Run(&ctx, n_targets, "grow_gpu_approx");
+}
+#endif  // defined(XGBOOST_USE_CUDA)
 
 class TestMinSplitLoss : public ::testing::Test {
   std::shared_ptr<DMatrix> dmat_;
diff --git a/tests/python-gpu/test_gpu_training_continuation.py b/tests/python-gpu/test_gpu_training_continuation.py
index a67d2f26b..6f948890d 100644
--- a/tests/python-gpu/test_gpu_training_continuation.py
+++ b/tests/python-gpu/test_gpu_training_continuation.py
@@ -1,54 +1,12 @@
-import json
-
 import numpy as np
+import pytest
 
-import xgboost as xgb
+from xgboost.testing.continuation import run_training_continuation_model_output
 
 rng = np.random.RandomState(1994)
 
 
 class TestGPUTrainingContinuation:
-    def test_training_continuation(self):
-        kRows = 64
-        kCols = 32
-        X = np.random.randn(kRows, kCols)
-        y = np.random.randn(kRows)
-        dtrain = xgb.DMatrix(X, y)
-        params = {
-            "tree_method": "gpu_hist",
-            "max_depth": "2",
-            "gamma": "0.1",
-            "alpha": "0.01",
-        }
-        bst_0 = xgb.train(params, dtrain, num_boost_round=64)
-        dump_0 = bst_0.get_dump(dump_format="json")
-
-        bst_1 = xgb.train(params, dtrain, num_boost_round=32)
-        bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1)
-        dump_1 = bst_1.get_dump(dump_format="json")
-
-        def recursive_compare(obj_0, obj_1):
-            if isinstance(obj_0, float):
-                assert np.isclose(obj_0, obj_1, atol=1e-6)
-            elif isinstance(obj_0, str):
-                assert obj_0 == obj_1
-            elif isinstance(obj_0, int):
-                assert obj_0 == obj_1
-            elif isinstance(obj_0, dict):
-                keys_0 = list(obj_0.keys())
-                keys_1 = list(obj_1.keys())
-                values_0 = list(obj_0.values())
-                values_1 = list(obj_1.values())
-                for i in range(len(obj_0.items())):
-                    assert keys_0[i] == keys_1[i]
-                    if list(obj_0.keys())[i] != "missing":
-                        recursive_compare(values_0[i], values_1[i])
-            else:
-                for i in range(len(obj_0)):
-                    recursive_compare(obj_0[i], obj_1[i])
-
-        assert len(dump_0) == len(dump_1)
-        for i in range(len(dump_0)):
-            obj_0 = json.loads(dump_0[i])
-            obj_1 = json.loads(dump_1[i])
-            recursive_compare(obj_0, obj_1)
+    @pytest.mark.parametrize("tree_method", ["hist", "approx"])
+    def test_model_output(self, tree_method: str) -> None:
+        run_training_continuation_model_output("cuda", tree_method)
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index 3a7501e48..d2e7cb5c4 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -16,13 +16,14 @@ class TestCallbacks:
     @classmethod
     def setup_class(cls):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
         cls.X = X
         cls.y = y
 
-        split = int(X.shape[0]*0.8)
-        cls.X_train = X[: split, ...]
-        cls.y_train = y[: split, ...]
+        split = int(X.shape[0] * 0.8)
+        cls.X_train = X[:split, ...]
+        cls.y_train = y[:split, ...]
         cls.X_valid = X[split:, ...]
         cls.y_valid = y[split:, ...]
 
@@ -31,31 +32,32 @@ class TestCallbacks:
         D_train: xgb.DMatrix,
         D_valid: xgb.DMatrix,
         rounds: int,
-        verbose_eval: Union[bool, int]
+        verbose_eval: Union[bool, int],
     ):
         def check_output(output: str) -> None:
             if int(verbose_eval) == 1:
                 # Should print each iteration info
-                assert len(output.split('\n')) == rounds
+                assert len(output.split("\n")) == rounds
             elif int(verbose_eval) > rounds:
                 # Should print first and latest iteration info
-                assert len(output.split('\n')) == 2
+                assert len(output.split("\n")) == 2
             else:
                 # Should print info by each period additionaly to first and latest
                 # iteration
                 num_periods = rounds // int(verbose_eval)
                 # Extra information is required for latest iteration
                 is_extra_info_required = num_periods * int(verbose_eval) < (rounds - 1)
-                assert len(output.split('\n')) == (
+                assert len(output.split("\n")) == (
                     1 + num_periods + int(is_extra_info_required)
                 )
 
         evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
-        params = {'objective': 'binary:logistic', 'eval_metric': 'error'}
+        params = {"objective": "binary:logistic", "eval_metric": "error"}
         with tm.captured_output() as (out, err):
             xgb.train(
-                params, D_train,
-                evals=[(D_train, 'Train'), (D_valid, 'Valid')],
+                params,
+                D_train,
+                evals=[(D_train, "Train"), (D_valid, "Valid")],
                 num_boost_round=rounds,
                 evals_result=evals_result,
                 verbose_eval=verbose_eval,
@@ -73,14 +75,16 @@ class TestCallbacks:
         D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
         evals_result = {}
         rounds = 10
-        xgb.train({'objective': 'binary:logistic',
-                   'eval_metric': 'error'}, D_train,
-                  evals=[(D_train, 'Train'), (D_valid, 'Valid')],
-                  num_boost_round=rounds,
-                  evals_result=evals_result,
-                  verbose_eval=True)
-        assert len(evals_result['Train']['error']) == rounds
-        assert len(evals_result['Valid']['error']) == rounds
+        xgb.train(
+            {"objective": "binary:logistic", "eval_metric": "error"},
+            D_train,
+            evals=[(D_train, "Train"), (D_valid, "Valid")],
+            num_boost_round=rounds,
+            evals_result=evals_result,
+            verbose_eval=True,
+        )
+        assert len(evals_result["Train"]["error"]) == rounds
+        assert len(evals_result["Valid"]["error"]) == rounds
 
         self.run_evaluation_monitor(D_train, D_valid, rounds, True)
         self.run_evaluation_monitor(D_train, D_valid, rounds, 2)
@@ -93,72 +97,83 @@ class TestCallbacks:
         evals_result = {}
         rounds = 30
         early_stopping_rounds = 5
-        booster = xgb.train({'objective': 'binary:logistic',
-                             'eval_metric': 'error'}, D_train,
-                            evals=[(D_train, 'Train'), (D_valid, 'Valid')],
-                            num_boost_round=rounds,
-                            evals_result=evals_result,
-                            verbose_eval=True,
-                            early_stopping_rounds=early_stopping_rounds)
-        dump = booster.get_dump(dump_format='json')
+        booster = xgb.train(
+            {"objective": "binary:logistic", "eval_metric": "error"},
+            D_train,
+            evals=[(D_train, "Train"), (D_valid, "Valid")],
+            num_boost_round=rounds,
+            evals_result=evals_result,
+            verbose_eval=True,
+            early_stopping_rounds=early_stopping_rounds,
+        )
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
 
     def test_early_stopping_custom_eval(self):
         D_train = xgb.DMatrix(self.X_train, self.y_train)
         D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
         early_stopping_rounds = 5
-        booster = xgb.train({'objective': 'binary:logistic',
-                             'eval_metric': 'error',
-                             'tree_method': 'hist'}, D_train,
-                            evals=[(D_train, 'Train'), (D_valid, 'Valid')],
-                            feval=tm.eval_error_metric,
-                            num_boost_round=1000,
-                            early_stopping_rounds=early_stopping_rounds,
-                            verbose_eval=False)
-        dump = booster.get_dump(dump_format='json')
+        booster = xgb.train(
+            {
+                "objective": "binary:logistic",
+                "eval_metric": "error",
+                "tree_method": "hist",
+            },
+            D_train,
+            evals=[(D_train, "Train"), (D_valid, "Valid")],
+            feval=tm.eval_error_metric,
+            num_boost_round=1000,
+            early_stopping_rounds=early_stopping_rounds,
+            verbose_eval=False,
+        )
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
 
     def test_early_stopping_customize(self):
         D_train = xgb.DMatrix(self.X_train, self.y_train)
         D_valid = xgb.DMatrix(self.X_valid, self.y_valid)
         early_stopping_rounds = 5
-        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
-                                                metric_name='CustomErr',
-                                                data_name='Train')
+        early_stop = xgb.callback.EarlyStopping(
+            rounds=early_stopping_rounds, metric_name="CustomErr", data_name="Train"
+        )
         # Specify which dataset and which metric should be used for early stopping.
         booster = xgb.train(
-            {'objective': 'binary:logistic',
-             'eval_metric': ['error', 'rmse'],
-             'tree_method': 'hist'}, D_train,
-            evals=[(D_train, 'Train'), (D_valid, 'Valid')],
+            {
+                "objective": "binary:logistic",
+                "eval_metric": ["error", "rmse"],
+                "tree_method": "hist",
+            },
+            D_train,
+            evals=[(D_train, "Train"), (D_valid, "Valid")],
             feval=tm.eval_error_metric,
             num_boost_round=1000,
             callbacks=[early_stop],
-            verbose_eval=False)
-        dump = booster.get_dump(dump_format='json')
+            verbose_eval=False,
+        )
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
-        assert len(early_stop.stopping_history['Train']['CustomErr']) == len(dump)
+        assert len(early_stop.stopping_history["Train"]["CustomErr"]) == len(dump)
 
         rounds = 100
         early_stop = xgb.callback.EarlyStopping(
             rounds=early_stopping_rounds,
-            metric_name='CustomErr',
-            data_name='Train',
+            metric_name="CustomErr",
+            data_name="Train",
             min_delta=100,
             save_best=True,
         )
         booster = xgb.train(
             {
-                'objective': 'binary:logistic',
-                'eval_metric': ['error', 'rmse'],
-                'tree_method': 'hist'
+                "objective": "binary:logistic",
+                "eval_metric": ["error", "rmse"],
+                "tree_method": "hist",
             },
             D_train,
-            evals=[(D_train, 'Train'), (D_valid, 'Valid')],
+            evals=[(D_train, "Train"), (D_valid, "Valid")],
             feval=tm.eval_error_metric,
             num_boost_round=rounds,
             callbacks=[early_stop],
-            verbose_eval=False
+            verbose_eval=False,
         )
         # No iteration can be made with min_delta == 100
         assert booster.best_iteration == 0
@@ -166,18 +181,20 @@ class TestCallbacks:
 
     def test_early_stopping_skl(self):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
         early_stopping_rounds = 5
         cls = xgb.XGBClassifier(
-            early_stopping_rounds=early_stopping_rounds, eval_metric='error'
+            early_stopping_rounds=early_stopping_rounds, eval_metric="error"
         )
         cls.fit(X, y, eval_set=[(X, y)])
         booster = cls.get_booster()
-        dump = booster.get_dump(dump_format='json')
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
 
     def test_early_stopping_custom_eval_skl(self):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
         early_stopping_rounds = 5
         early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds)
@@ -186,11 +203,12 @@ class TestCallbacks:
         )
         cls.fit(X, y, eval_set=[(X, y)])
         booster = cls.get_booster()
-        dump = booster.get_dump(dump_format='json')
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
 
     def test_early_stopping_save_best_model(self):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
         n_estimators = 100
         early_stopping_rounds = 5
@@ -200,11 +218,11 @@ class TestCallbacks:
         cls = xgb.XGBClassifier(
             n_estimators=n_estimators,
             eval_metric=tm.eval_error_metric_skl,
-            callbacks=[early_stop]
+            callbacks=[early_stop],
         )
         cls.fit(X, y, eval_set=[(X, y)])
         booster = cls.get_booster()
-        dump = booster.get_dump(dump_format='json')
+        dump = booster.get_dump(dump_format="json")
         assert len(dump) == booster.best_iteration + 1
 
         early_stop = xgb.callback.EarlyStopping(
@@ -220,8 +238,9 @@ class TestCallbacks:
             cls.fit(X, y, eval_set=[(X, y)])
 
         # No error
-        early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
-                                                save_best=False)
+        early_stop = xgb.callback.EarlyStopping(
+            rounds=early_stopping_rounds, save_best=False
+        )
         xgb.XGBClassifier(
             booster="gblinear",
             n_estimators=10,
@@ -231,14 +250,17 @@ class TestCallbacks:
 
     def test_early_stopping_continuation(self):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
-        cls = xgb.XGBClassifier(eval_metric=tm.eval_error_metric_skl)
+
         early_stopping_rounds = 5
         early_stop = xgb.callback.EarlyStopping(
             rounds=early_stopping_rounds, save_best=True
         )
-        with pytest.warns(UserWarning):
-            cls.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
+        cls = xgb.XGBClassifier(
+            eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
+        )
+        cls.fit(X, y, eval_set=[(X, y)])
 
         booster = cls.get_booster()
         assert booster.num_boosted_rounds() == booster.best_iteration + 1
@@ -256,21 +278,10 @@ class TestCallbacks:
             )
             cls.fit(X, y, eval_set=[(X, y)])
             booster = cls.get_booster()
-            assert booster.num_boosted_rounds() == \
-                booster.best_iteration + early_stopping_rounds + 1
-
-    def test_deprecated(self):
-        from sklearn.datasets import load_breast_cancer
-        X, y = load_breast_cancer(return_X_y=True)
-        early_stopping_rounds = 5
-        early_stop = xgb.callback.EarlyStopping(
-            rounds=early_stopping_rounds, save_best=True
-        )
-        clf = xgb.XGBClassifier(
-            eval_metric=tm.eval_error_metric_skl, callbacks=[early_stop]
-        )
-        with pytest.raises(ValueError, match=r".*set_params.*"):
-            clf.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop])
+            assert (
+                booster.num_boosted_rounds()
+                == booster.best_iteration + early_stopping_rounds + 1
+            )
 
     def run_eta_decay(self, tree_method):
         """Test learning rate scheduler, used by both CPU and GPU tests."""
@@ -343,7 +354,7 @@ class TestCallbacks:
             callbacks=[scheduler([0, 0, 0, 0])],
             evals_result=evals_result,
         )
-        eval_errors_2 = list(map(float, evals_result['eval']['error']))
+        eval_errors_2 = list(map(float, evals_result["eval"]["error"]))
         assert isinstance(bst, xgb.core.Booster)
         # validation error should not decrease, if eta/learning_rate = 0
         assert eval_errors_2[0] == eval_errors_2[-1]
@@ -361,7 +372,7 @@ class TestCallbacks:
             callbacks=[scheduler(eta_decay)],
             evals_result=evals_result,
         )
-        eval_errors_3 = list(map(float, evals_result['eval']['error']))
+        eval_errors_3 = list(map(float, evals_result["eval"]["error"]))
 
         assert isinstance(bst, xgb.core.Booster)
 
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index 7695c6861..a275a8077 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -15,23 +15,23 @@ class TestEarlyStopping:
         from sklearn.model_selection import train_test_split
 
         digits = load_digits(n_class=2)
-        X = digits['data']
-        y = digits['target']
+        X = digits["data"]
+        y = digits["target"]
         X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
-        clf1 = xgb.XGBClassifier(learning_rate=0.1)
-        clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
-                 eval_set=[(X_test, y_test)])
-        clf2 = xgb.XGBClassifier(learning_rate=0.1)
-        clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc",
-                 eval_set=[(X_test, y_test)])
+        clf1 = xgb.XGBClassifier(
+            learning_rate=0.1, early_stopping_rounds=5, eval_metric="auc"
+        )
+        clf1.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        clf2 = xgb.XGBClassifier(
+            learning_rate=0.1, early_stopping_rounds=4, eval_metric="auc"
+        )
+        clf2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
         # should be the same
         assert clf1.best_score == clf2.best_score
         assert clf1.best_score != 1
         # check overfit
         clf3 = xgb.XGBClassifier(
-            learning_rate=0.1,
-            eval_metric="auc",
-            early_stopping_rounds=10
+            learning_rate=0.1, eval_metric="auc", early_stopping_rounds=10
         )
         clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
         base_score = get_basescore(clf3)
@@ -39,9 +39,9 @@ class TestEarlyStopping:
 
         clf3 = xgb.XGBClassifier(
             learning_rate=0.1,
-            base_score=.5,
+            base_score=0.5,
             eval_metric="auc",
-            early_stopping_rounds=10
+            early_stopping_rounds=10,
         )
         clf3.fit(X_train, y_train, eval_set=[(X_test, y_test)])
 
diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py
index 92726014b..cbb3dc88d 100644
--- a/tests/python/test_eval_metrics.py
+++ b/tests/python/test_eval_metrics.py
@@ -9,37 +9,41 @@ rng = np.random.RandomState(1337)
 
 
 class TestEvalMetrics:
-    xgb_params_01 = {'nthread': 1, 'eval_metric': 'error'}
+    xgb_params_01 = {"nthread": 1, "eval_metric": "error"}
 
-    xgb_params_02 = {'nthread': 1, 'eval_metric': ['error']}
+    xgb_params_02 = {"nthread": 1, "eval_metric": ["error"]}
 
-    xgb_params_03 = {'nthread': 1, 'eval_metric': ['rmse', 'error']}
+    xgb_params_03 = {"nthread": 1, "eval_metric": ["rmse", "error"]}
 
-    xgb_params_04 = {'nthread': 1, 'eval_metric': ['error', 'rmse']}
+    xgb_params_04 = {"nthread": 1, "eval_metric": ["error", "rmse"]}
 
     def evalerror_01(self, preds, dtrain):
         labels = dtrain.get_label()
-        return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+        return "error", float(sum(labels != (preds > 0.0))) / len(labels)
 
     def evalerror_02(self, preds, dtrain):
         labels = dtrain.get_label()
-        return [('error', float(sum(labels != (preds > 0.0))) / len(labels))]
+        return [("error", float(sum(labels != (preds > 0.0))) / len(labels))]
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def evalerror_03(self, preds, dtrain):
         from sklearn.metrics import mean_squared_error
 
         labels = dtrain.get_label()
-        return [('rmse', mean_squared_error(labels, preds)),
-                ('error', float(sum(labels != (preds > 0.0))) / len(labels))]
+        return [
+            ("rmse", mean_squared_error(labels, preds)),
+            ("error", float(sum(labels != (preds > 0.0))) / len(labels)),
+        ]
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def evalerror_04(self, preds, dtrain):
         from sklearn.metrics import mean_squared_error
 
         labels = dtrain.get_label()
-        return [('error', float(sum(labels != (preds > 0.0))) / len(labels)),
-                ('rmse', mean_squared_error(labels, preds))]
+        return [
+            ("error", float(sum(labels != (preds > 0.0))) / len(labels)),
+            ("rmse", mean_squared_error(labels, preds)),
+        ]
 
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_eval_metrics(self):
@@ -50,15 +54,15 @@ class TestEvalMetrics:
         from sklearn.datasets import load_digits
 
         digits = load_digits(n_class=2)
-        X = digits['data']
-        y = digits['target']
+        X = digits["data"]
+        y = digits["target"]
 
         Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0)
 
         dtrain = xgb.DMatrix(Xt, label=yt)
         dvalid = xgb.DMatrix(Xv, label=yv)
 
-        watchlist = [(dtrain, 'train'), (dvalid, 'val')]
+        watchlist = [(dtrain, "train"), (dvalid, "val")]
 
         gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10)
         gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10)
@@ -66,26 +70,54 @@ class TestEvalMetrics:
         assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
         assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
 
-        gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
-                            early_stopping_rounds=2)
-        gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
-                            early_stopping_rounds=2)
-        gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
-                            early_stopping_rounds=2)
-        gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
-                            early_stopping_rounds=2)
+        gbdt_01 = xgb.train(
+            self.xgb_params_01, dtrain, 10, watchlist, early_stopping_rounds=2
+        )
+        gbdt_02 = xgb.train(
+            self.xgb_params_02, dtrain, 10, watchlist, early_stopping_rounds=2
+        )
+        gbdt_03 = xgb.train(
+            self.xgb_params_03, dtrain, 10, watchlist, early_stopping_rounds=2
+        )
+        gbdt_04 = xgb.train(
+            self.xgb_params_04, dtrain, 10, watchlist, early_stopping_rounds=2
+        )
         assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
         assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
         assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
 
-        gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist,
-                            early_stopping_rounds=2, feval=self.evalerror_01)
-        gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist,
-                            early_stopping_rounds=2, feval=self.evalerror_02)
-        gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist,
-                            early_stopping_rounds=2, feval=self.evalerror_03)
-        gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist,
-                            early_stopping_rounds=2, feval=self.evalerror_04)
+        gbdt_01 = xgb.train(
+            self.xgb_params_01,
+            dtrain,
+            10,
+            watchlist,
+            early_stopping_rounds=2,
+            feval=self.evalerror_01,
+        )
+        gbdt_02 = xgb.train(
+            self.xgb_params_02,
+            dtrain,
+            10,
+            watchlist,
+            early_stopping_rounds=2,
+            feval=self.evalerror_02,
+        )
+        gbdt_03 = xgb.train(
+            self.xgb_params_03,
+            dtrain,
+            10,
+            watchlist,
+            early_stopping_rounds=2,
+            feval=self.evalerror_03,
+        )
+        gbdt_04 = xgb.train(
+            self.xgb_params_04,
+            dtrain,
+            10,
+            watchlist,
+            early_stopping_rounds=2,
+            feval=self.evalerror_04,
+        )
         assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0]
         assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0]
         assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0]
@@ -93,6 +125,7 @@ class TestEvalMetrics:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_gamma_deviance(self):
         from sklearn.metrics import mean_gamma_deviance
+
         rng = np.random.RandomState(1994)
         n_samples = 100
         n_features = 30
@@ -101,8 +134,13 @@ class TestEvalMetrics:
         y = rng.randn(n_samples)
         y = y - y.min() * 100
 
-        reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=10)
-        reg.fit(X, y, eval_metric="gamma-deviance")
+        reg = xgb.XGBRegressor(
+            tree_method="hist",
+            objective="reg:gamma",
+            n_estimators=10,
+            eval_metric="gamma-deviance",
+        )
+        reg.fit(X, y)
 
         booster = reg.get_booster()
         score = reg.predict(X)
@@ -113,16 +151,26 @@ class TestEvalMetrics:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_gamma_lik(self) -> None:
         import scipy.stats as stats
+
         rng = np.random.default_rng(1994)
         n_samples = 32
         n_features = 10
 
-        X = rng.normal(0, 1, size=n_samples * n_features).reshape((n_samples, n_features))
+        X = rng.normal(0, 1, size=n_samples * n_features).reshape(
+            (n_samples, n_features)
+        )
 
         alpha, loc, beta = 5.0, 11.1, 22
-        y = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=n_samples, random_state=rng)
-        reg = xgb.XGBRegressor(tree_method="hist", objective="reg:gamma", n_estimators=64)
-        reg.fit(X, y, eval_metric="gamma-nloglik", eval_set=[(X, y)])
+        y = stats.gamma.rvs(
+            alpha, loc=loc, scale=beta, size=n_samples, random_state=rng
+        )
+        reg = xgb.XGBRegressor(
+            tree_method="hist",
+            objective="reg:gamma",
+            n_estimators=64,
+            eval_metric="gamma-nloglik",
+        )
+        reg.fit(X, y, eval_set=[(X, y)])
 
         score = reg.predict(X)
 
@@ -134,7 +182,7 @@ class TestEvalMetrics:
         # XGBoost uses the canonical link function of gamma in evaluation function.
         # so \theta = - (1.0 / y)
         # dispersion is hardcoded as 1.0, so shape (a in scipy parameter) is also 1.0
-        beta = - (1.0 / (- (1.0 / y)))  # == y
+        beta = -(1.0 / (-(1.0 / y)))  # == y
         nloglik_stats = -stats.gamma.logpdf(score, a=1.0, scale=beta)
 
         np.testing.assert_allclose(nloglik, np.mean(nloglik_stats), rtol=1e-3)
@@ -153,7 +201,7 @@ class TestEvalMetrics:
             n_features,
             n_informative=n_features,
             n_redundant=0,
-            random_state=rng
+            random_state=rng,
         )
         Xy = xgb.DMatrix(X, y)
         booster = xgb.train(
@@ -197,7 +245,7 @@ class TestEvalMetrics:
             n_informative=n_features,
             n_redundant=0,
             n_classes=n_classes,
-            random_state=rng
+            random_state=rng,
         )
         if weighted:
             weights = rng.randn(n_samples)
@@ -242,20 +290,25 @@ class TestEvalMetrics:
     def run_pr_auc_binary(self, tree_method):
         from sklearn.datasets import make_classification
         from sklearn.metrics import auc, precision_recall_curve
+
         X, y = make_classification(128, 4, n_classes=2, random_state=1994)
-        clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
-        clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
         evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
 
         y_score = clf.predict_proba(X)[:, 1]  # get the positive column
         precision, recall, _ = precision_recall_curve(y, y_score)
         prauc = auc(recall, precision)
-        # Interpolation results are slightly different from sklearn, but overall should be
-        # similar.
+        # Interpolation results are slightly different from sklearn, but overall should
+        # be similar.
         np.testing.assert_allclose(prauc, evals_result, rtol=1e-2)
 
-        clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
-        clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
         evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
         np.testing.assert_allclose(0.99, evals_result, rtol=1e-2)
 
@@ -264,16 +317,21 @@ class TestEvalMetrics:
 
     def run_pr_auc_multi(self, tree_method):
         from sklearn.datasets import make_classification
+
         X, y = make_classification(
             64, 16, n_informative=8, n_classes=3, random_state=1994
         )
-        clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=1)
-        clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, n_estimators=1, eval_metric="aucpr"
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
         evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
-        # No available implementation for comparison, just check that XGBoost converges to
-        # 1.0
-        clf = xgb.XGBClassifier(tree_method=tree_method, n_estimators=10)
-        clf.fit(X, y, eval_metric="aucpr", eval_set=[(X, y)])
+        # No available implementation for comparison, just check that XGBoost converges
+        # to 1.0
+        clf = xgb.XGBClassifier(
+            tree_method=tree_method, n_estimators=10, eval_metric="aucpr"
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
         evals_result = clf.evals_result()["validation_0"]["aucpr"][-1]
         np.testing.assert_allclose(1.0, evals_result, rtol=1e-2)
 
@@ -282,9 +340,13 @@ class TestEvalMetrics:
 
     def run_pr_auc_ltr(self, tree_method):
         from sklearn.datasets import make_classification
+
         X, y = make_classification(128, 4, n_classes=2, random_state=1994)
         ltr = xgb.XGBRanker(
-            tree_method=tree_method, n_estimators=16, objective="rank:pairwise"
+            tree_method=tree_method,
+            n_estimators=16,
+            objective="rank:pairwise",
+            eval_metric="aucpr",
         )
         groups = np.array([32, 32, 64])
         ltr.fit(
@@ -293,7 +355,6 @@ class TestEvalMetrics:
             group=groups,
             eval_set=[(X, y)],
             eval_group=[groups],
-            eval_metric="aucpr",
         )
         results = ltr.evals_result()["validation_0"]["aucpr"]
         assert results[-1] >= 0.99
diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py
index 6b2f96301..9e9f37ea5 100644
--- a/tests/python/test_training_continuation.py
+++ b/tests/python/test_training_continuation.py
@@ -6,6 +6,7 @@ import pytest
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.continuation import run_training_continuation_model_output
 
 rng = np.random.RandomState(1337)
 
@@ -15,54 +16,51 @@ class TestTrainingContinuation:
 
     def generate_parameters(self):
         xgb_params_01_binary = {
-            'nthread': 1,
+            "nthread": 1,
         }
 
         xgb_params_02_binary = {
-            'nthread': 1,
-            'num_parallel_tree': self.num_parallel_tree
+            "nthread": 1,
+            "num_parallel_tree": self.num_parallel_tree,
         }
 
         xgb_params_03_binary = {
-            'nthread': 1,
-            'num_class': 5,
-            'num_parallel_tree': self.num_parallel_tree
+            "nthread": 1,
+            "num_class": 5,
+            "num_parallel_tree": self.num_parallel_tree,
         }
 
-        return [
-            xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary
-        ]
+        return [xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary]
 
-    def run_training_continuation(self, xgb_params_01, xgb_params_02,
-                                  xgb_params_03):
+    def run_training_continuation(self, xgb_params_01, xgb_params_02, xgb_params_03):
         from sklearn.datasets import load_digits
         from sklearn.metrics import mean_squared_error
 
         digits_2class = load_digits(n_class=2)
         digits_5class = load_digits(n_class=5)
 
-        X_2class = digits_2class['data']
-        y_2class = digits_2class['target']
+        X_2class = digits_2class["data"]
+        y_2class = digits_2class["target"]
 
-        X_5class = digits_5class['data']
-        y_5class = digits_5class['target']
+        X_5class = digits_5class["data"]
+        y_5class = digits_5class["target"]
 
         dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
         dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)
 
-        gbdt_01 = xgb.train(xgb_params_01, dtrain_2class,
-                            num_boost_round=10)
+        gbdt_01 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10)
         ntrees_01 = len(gbdt_01.get_dump())
         assert ntrees_01 == 10
 
-        gbdt_02 = xgb.train(xgb_params_01, dtrain_2class,
-                            num_boost_round=0)
-        gbdt_02.save_model('xgb_tc.json')
+        gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0)
+        gbdt_02.save_model("xgb_tc.json")
 
-        gbdt_02a = xgb.train(xgb_params_01, dtrain_2class,
-                             num_boost_round=10, xgb_model=gbdt_02)
-        gbdt_02b = xgb.train(xgb_params_01, dtrain_2class,
-                             num_boost_round=10, xgb_model="xgb_tc.json")
+        gbdt_02a = xgb.train(
+            xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02
+        )
+        gbdt_02b = xgb.train(
+            xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model="xgb_tc.json"
+        )
         ntrees_02a = len(gbdt_02a.get_dump())
         ntrees_02b = len(gbdt_02b.get_dump())
         assert ntrees_02a == 10
@@ -76,20 +74,21 @@ class TestTrainingContinuation:
         res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
         assert res1 == res2
 
-        gbdt_03 = xgb.train(xgb_params_01, dtrain_2class,
-                            num_boost_round=3)
-        gbdt_03.save_model('xgb_tc.json')
+        gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3)
+        gbdt_03.save_model("xgb_tc.json")
 
-        gbdt_03a = xgb.train(xgb_params_01, dtrain_2class,
-                             num_boost_round=7, xgb_model=gbdt_03)
-        gbdt_03b = xgb.train(xgb_params_01, dtrain_2class,
-                             num_boost_round=7, xgb_model="xgb_tc.json")
+        gbdt_03a = xgb.train(
+            xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03
+        )
+        gbdt_03b = xgb.train(
+            xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model="xgb_tc.json"
+        )
         ntrees_03a = len(gbdt_03a.get_dump())
         ntrees_03b = len(gbdt_03b.get_dump())
         assert ntrees_03a == 10
         assert ntrees_03b == 10
 
-        os.remove('xgb_tc.json')
+        os.remove("xgb_tc.json")
 
         res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
         res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
@@ -113,16 +112,14 @@ class TestTrainingContinuation:
             y_2class,
             gbdt_04.predict(
                 dtrain_2class, iteration_range=(0, gbdt_04.num_boosted_rounds())
-            )
+            ),
         )
         assert res1 == res2
 
-        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
-                            num_boost_round=7)
-        gbdt_05 = xgb.train(xgb_params_03,
-                            dtrain_5class,
-                            num_boost_round=3,
-                            xgb_model=gbdt_05)
+        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=7)
+        gbdt_05 = xgb.train(
+            xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05
+        )
 
         res1 = gbdt_05.predict(dtrain_5class)
         res2 = gbdt_05.predict(
@@ -149,8 +146,8 @@ class TestTrainingContinuation:
         from sklearn.datasets import load_breast_cancer
 
         X, y = load_breast_cancer(return_X_y=True)
-        clf = xgb.XGBClassifier(n_estimators=2)
-        clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
+        clf = xgb.XGBClassifier(n_estimators=2, eval_metric="logloss")
+        clf.fit(X, y, eval_set=[(X, y)])
         assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"])
 
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -160,5 +157,10 @@ class TestTrainingContinuation:
 
         clf = xgb.XGBClassifier(n_estimators=2)
         # change metric to error
-        clf.fit(X, y, eval_set=[(X, y)], eval_metric="error")
+        clf.set_params(eval_metric="error")
+        clf.fit(X, y, eval_set=[(X, y)], xgb_model=loaded)
         assert tm.non_increasing(clf.evals_result()["validation_0"]["error"])
+
+    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
+    def test_model_output(self, tree_method: str) -> None:
+        run_training_continuation_model_output("cpu", tree_method)
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 47f1778d6..9047cee6e 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -30,8 +30,8 @@ def test_binary_classification():
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
         for train_index, test_index in kf.split(X, y):
-            clf = cls(random_state=42)
-            xgb_model = clf.fit(X[train_index], y[train_index], eval_metric=['auc', 'logloss'])
+            clf = cls(random_state=42, eval_metric=['auc', 'logloss'])
+            xgb_model = clf.fit(X[train_index], y[train_index])
             preds = xgb_model.predict(X[test_index])
             labels = y[test_index]
             err = sum(1 for i in range(len(preds))
@@ -101,10 +101,11 @@ def test_best_iteration():
     def train(booster: str, forest: Optional[int]) -> None:
         rounds = 4
         cls = xgb.XGBClassifier(
-            n_estimators=rounds, num_parallel_tree=forest, booster=booster
-        ).fit(
-            X, y, eval_set=[(X, y)], early_stopping_rounds=3
-        )
+            n_estimators=rounds,
+            num_parallel_tree=forest,
+            booster=booster,
+            early_stopping_rounds=3,
+        ).fit(X, y, eval_set=[(X, y)])
         assert cls.best_iteration == rounds - 1
 
         # best_iteration is used by default, assert that under gblinear it's
@@ -112,9 +113,9 @@ def test_best_iteration():
         cls.predict(X)
 
     num_parallel_tree = 4
-    train('gbtree', num_parallel_tree)
-    train('dart', num_parallel_tree)
-    train('gblinear', None)
+    train("gbtree", num_parallel_tree)
+    train("dart", num_parallel_tree)
+    train("gblinear", None)
 
 
 def test_ranking():
@@ -258,6 +259,7 @@ def test_stacking_classification():
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
     clf.fit(X_train, y_train).score(X_test, y_test)
 
+
 @pytest.mark.skipif(**tm.no_pandas())
 def test_feature_importances_weight():
     from sklearn.datasets import load_digits
@@ -474,7 +476,8 @@ def run_housing_rf_regression(tree_method):
 
     rfreg = xgb.XGBRFRegressor()
     with pytest.raises(NotImplementedError):
-        rfreg.fit(X, y, early_stopping_rounds=10)
+        rfreg.set_params(early_stopping_rounds=10)
+        rfreg.fit(X, y)
 
 
 def test_rf_regression():
@@ -574,7 +577,7 @@ def test_classification_with_custom_objective():
         return logregobj(y, p)
 
     cls.set_params(objective=wrapped)
-    cls.predict(X)              # no throw
+    cls.predict(X)  # no throw
     cls.fit(X, y)
 
     assert is_called[0]
@@ -844,51 +847,65 @@ def run_validation_weights(model):
     y_train, y_test = y[:1600], y[1600:]
 
     # instantiate model
-    param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
-                  'random_state': 123}
+    param_dist = {
+        "objective": "binary:logistic",
+        "n_estimators": 2,
+        "random_state": 123,
+    }
     clf = model(**param_dist)
 
     # train it using instance weights only in the training set
     weights_train = np.random.choice([1, 2], len(X_train))
-    clf.fit(X_train, y_train,
-            sample_weight=weights_train,
-            eval_set=[(X_test, y_test)],
-            eval_metric='logloss',
-            verbose=False)
-
+    clf.set_params(eval_metric="logloss")
+    clf.fit(
+        X_train,
+        y_train,
+        sample_weight=weights_train,
+        eval_set=[(X_test, y_test)],
+        verbose=False,
+    )
     # evaluate logloss metric on test set *without* using weights
     evals_result_without_weights = clf.evals_result()
-    logloss_without_weights = evals_result_without_weights[
-        "validation_0"]["logloss"]
+    logloss_without_weights = evals_result_without_weights["validation_0"]["logloss"]
 
     # now use weights for the test set
     np.random.seed(0)
     weights_test = np.random.choice([1, 2], len(X_test))
-    clf.fit(X_train, y_train,
-            sample_weight=weights_train,
-            eval_set=[(X_test, y_test)],
-            sample_weight_eval_set=[weights_test],
-            eval_metric='logloss',
-            verbose=False)
+    clf.set_params(eval_metric="logloss")
+    clf.fit(
+        X_train,
+        y_train,
+        sample_weight=weights_train,
+        eval_set=[(X_test, y_test)],
+        sample_weight_eval_set=[weights_test],
+        verbose=False,
+    )
     evals_result_with_weights = clf.evals_result()
     logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"]
 
     # check that the logloss in the test set is actually different when using
     # weights than when not using them
-    assert all((logloss_with_weights[i] != logloss_without_weights[i]
-                for i in [0, 1]))
+    assert all((logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]))
 
     with pytest.raises(ValueError):
         # length of eval set and sample weight doesn't match.
-        clf.fit(X_train, y_train, sample_weight=weights_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                sample_weight_eval_set=[weights_train])
+        clf.fit(
+            X_train,
+            y_train,
+            sample_weight=weights_train,
+            eval_set=[(X_train, y_train), (X_test, y_test)],
+            sample_weight_eval_set=[weights_train],
+        )
 
     with pytest.raises(ValueError):
         cls = xgb.XGBClassifier()
-        cls.fit(X_train, y_train, sample_weight=weights_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                sample_weight_eval_set=[weights_train])
+        cls.fit(
+            X_train,
+            y_train,
+            sample_weight=weights_train,
+            eval_set=[(X_train, y_train), (X_test, y_test)],
+            sample_weight_eval_set=[weights_train],
+        )
 
 
 def test_validation_weights():
@@ -960,8 +977,7 @@ def test_XGBClassifier_resume():
 
         # file name of stored xgb model
         model1.save_model(model1_path)
-        model2 = xgb.XGBClassifier(
-            learning_rate=0.3, random_state=0, n_estimators=8)
+        model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
         model2.fit(X, Y, xgb_model=model1_path)
 
         pred2 = model2.predict(X)
@@ -972,8 +988,7 @@ def test_XGBClassifier_resume():
 
         # file name of 'Booster' instance Xgb model
         model1.get_booster().save_model(model1_booster_path)
-        model2 = xgb.XGBClassifier(
-            learning_rate=0.3, random_state=0, n_estimators=8)
+        model2 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8)
         model2.fit(X, Y, xgb_model=model1_booster_path)
 
         pred2 = model2.predict(X)
@@ -1279,12 +1294,16 @@ def test_estimator_reg(estimator, check):
         ):
             estimator.fit(X, y)
         return
-    if os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params") != -1:
+    if (
+        os.environ["PYTEST_CURRENT_TEST"].find("check_estimators_overwrite_params")
+        != -1
+    ):
         # A hack to pass the scikit-learn parameter mutation tests.  XGBoost regressor
-        # returns actual internal default values for parameters in `get_params`, but those
-        # are set as `None` in sklearn interface to avoid duplication.  So we fit a dummy
-        # model and obtain the default parameters here for the mutation tests.
+        # returns actual internal default values for parameters in `get_params`, but
+        # those are set as `None` in sklearn interface to avoid duplication.  So we fit
+        # a dummy model and obtain the default parameters here for the mutation tests.
         from sklearn.datasets import make_regression
+
         X, y = make_regression(n_samples=2, n_features=1)
         estimator.set_params(**xgb.XGBRegressor().fit(X, y).get_params())
 
@@ -1325,6 +1344,7 @@ def test_categorical():
 def test_evaluation_metric():
     from sklearn.datasets import load_diabetes, load_digits
     from sklearn.metrics import mean_absolute_error
+
     X, y = load_diabetes(return_X_y=True)
     n_estimators = 16
 
@@ -1341,17 +1361,6 @@ def test_evaluation_metric():
     for line in lines:
         assert line.find("mean_absolute_error") != -1
 
-    def metric(predt: np.ndarray, Xy: xgb.DMatrix):
-        y = Xy.get_label()
-        return "m", np.abs(predt - y).sum()
-
-    with pytest.warns(UserWarning):
-        reg = xgb.XGBRegressor(
-            tree_method="hist",
-            n_estimators=1,
-        )
-        reg.fit(X, y, eval_set=[(X, y)], eval_metric=metric)
-
     def merror(y_true: np.ndarray, predt: np.ndarray):
         n_samples = y_true.shape[0]
         assert n_samples == predt.size
diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
index 1bc6bbccd..1b58b2657 100644
--- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
+++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -363,12 +363,12 @@ class TestDistributedGPU:
             device="cuda",
             eval_metric="error",
             n_estimators=100,
+            early_stopping_rounds=early_stopping_rounds,
         )
         cls.client = local_cuda_client
         cls.fit(
             X,
             y,
-            early_stopping_rounds=early_stopping_rounds,
             eval_set=[(valid_X, valid_y)],
         )
         booster = cls.get_booster()
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index 79df025fe..150e698d3 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -937,8 +937,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)
 
-    cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
-    cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
+    cls = xgb.dask.DaskXGBClassifier(
+        device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
+    )
+    cls.fit(X, y, eval_set=[(valid_X, valid_y)])
 
     # multiclass
     X_, y_ = make_classification(
@@ -966,8 +968,10 @@ def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None
     valid_X = dd.from_array(valid_X_, chunksize=n_samples)
     valid_y = dd.from_array(valid_y_, chunksize=n_samples)
 
-    cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
-    cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)])
+    cls = xgb.dask.DaskXGBClassifier(
+        device=device, n_estimators=2, eval_metric=["auc", "aucpr"]
+    )
+    cls.fit(X, y, eval_set=[(valid_X, valid_y)])
 
 
 def test_empty_dmatrix_auc() -> None:
@@ -994,11 +998,11 @@ def run_auc(client: "Client", device: str) -> None:
     valid_X = dd.from_array(valid_X_, chunksize=10)
     valid_y = dd.from_array(valid_y_, chunksize=10)
 
-    cls = xgb.XGBClassifier(device=device, n_estimators=2)
-    cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])
+    cls = xgb.XGBClassifier(device=device, n_estimators=2, eval_metric="auc")
+    cls.fit(X_, y_, eval_set=[(valid_X_, valid_y_)])
 
-    dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2)
-    dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])
+    dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2, eval_metric="auc")
+    dcls.fit(X, y, eval_set=[(valid_X, valid_y)])
 
     approx = dcls.evals_result()["validation_0"]["auc"]
     exact = cls.evals_result()["validation_0"]["auc"]
@@ -1267,16 +1271,16 @@ def test_dask_ranking(client: "Client") -> None:
     qid_valid = qid_valid.astype(np.uint32)
     qid_test = qid_test.astype(np.uint32)
 
-    rank = xgb.dask.DaskXGBRanker(n_estimators=2500)
+    rank = xgb.dask.DaskXGBRanker(
+        n_estimators=2500, eval_metric=["ndcg"], early_stopping_rounds=10
+    )
     rank.fit(
         x_train,
         y_train,
         qid=qid_train,
         eval_set=[(x_test, y_test), (x_train, y_train)],
         eval_qid=[qid_test, qid_train],
-        eval_metric=["ndcg"],
         verbose=True,
-        early_stopping_rounds=10,
     )
     assert rank.n_features_in_ == 46
     assert rank.best_score > 0.98
@@ -2150,13 +2154,15 @@ class TestDaskCallbacks:
         valid_X, valid_y = load_breast_cancer(return_X_y=True)
         valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y)
         cls = xgb.dask.DaskXGBClassifier(
-            objective="binary:logistic", tree_method="hist", n_estimators=1000
+            objective="binary:logistic",
+            tree_method="hist",
+            n_estimators=1000,
+            early_stopping_rounds=early_stopping_rounds,
         )
         cls.client = client
         cls.fit(
             X,
             y,
-            early_stopping_rounds=early_stopping_rounds,
             eval_set=[(valid_X, valid_y)],
         )
         booster = cls.get_booster()
@@ -2165,15 +2171,17 @@ class TestDaskCallbacks:
 
         # Specify the metric
         cls = xgb.dask.DaskXGBClassifier(
-            objective="binary:logistic", tree_method="hist", n_estimators=1000
+            objective="binary:logistic",
+            tree_method="hist",
+            n_estimators=1000,
+            early_stopping_rounds=early_stopping_rounds,
+            eval_metric="error",
         )
         cls.client = client
         cls.fit(
             X,
             y,
-            early_stopping_rounds=early_stopping_rounds,
             eval_set=[(valid_X, valid_y)],
-            eval_metric="error",
         )
         assert tm.non_increasing(cls.evals_result()["validation_0"]["error"])
         booster = cls.get_booster()
@@ -2215,12 +2223,12 @@ class TestDaskCallbacks:
             tree_method="hist",
             n_estimators=1000,
             eval_metric=tm.eval_error_metric_skl,
+            early_stopping_rounds=early_stopping_rounds,
         )
         cls.client = client
         cls.fit(
             X,
             y,
-            early_stopping_rounds=early_stopping_rounds,
             eval_set=[(valid_X, valid_y)],
         )
         booster = cls.get_booster()
@@ -2234,21 +2242,22 @@ class TestDaskCallbacks:
         X, y = load_breast_cancer(return_X_y=True)
         X, y = da.from_array(X), da.from_array(y)
 
-        cls = xgb.dask.DaskXGBClassifier(
-            objective="binary:logistic", tree_method="hist", n_estimators=10
-        )
-        cls.client = client
-
         with tempfile.TemporaryDirectory() as tmpdir:
-            cls.fit(
-                X,
-                y,
+            cls = xgb.dask.DaskXGBClassifier(
+                objective="binary:logistic",
+                tree_method="hist",
+                n_estimators=10,
                 callbacks=[
                     xgb.callback.TrainingCheckPoint(
                         directory=Path(tmpdir), interval=1, name="model"
                     )
                 ],
             )
+            cls.client = client
+            cls.fit(
+                X,
+                y,
+            )
             for i in range(1, 10):
                 assert os.path.exists(
                     os.path.join(
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index 406174542..b8c16ef1c 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -311,24 +311,20 @@ def clf_with_weight(
     y_val = np.array([0, 1])
     w_train = np.array([1.0, 2.0])
     w_val = np.array([1.0, 2.0])
-    cls2 = XGBClassifier()
+    cls2 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
     cls2.fit(
         X_train,
         y_train,
         eval_set=[(X_val, y_val)],
-        early_stopping_rounds=1,
-        eval_metric="logloss",
     )
 
-    cls3 = XGBClassifier()
+    cls3 = XGBClassifier(eval_metric="logloss", early_stopping_rounds=1)
     cls3.fit(
         X_train,
         y_train,
         sample_weight=w_train,
         eval_set=[(X_val, y_val)],
         sample_weight_eval_set=[w_val],
-        early_stopping_rounds=1,
-        eval_metric="logloss",
     )
 
     cls_df_train_with_eval_weight = spark.createDataFrame(