From 2c12b956da33aa9a8ae3dbcc9fe883d732cade46 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 1 Mar 2024 08:57:47 +0100 Subject: [PATCH] [R] Refactor callback structure and attributes (#9957) --- R-package/NAMESPACE | 22 +- R-package/R/callbacks.R | 1918 ++++++++++------- R-package/R/utils.R | 2 +- R-package/R/xgb.Booster.R | 35 +- R-package/R/xgb.cv.R | 138 +- R-package/R/xgb.load.R | 2 +- R-package/R/xgb.train.R | 189 +- R-package/R/xgboost.R | 7 +- R-package/man/callbacks.Rd | 37 - R-package/man/cb.early.stop.Rd | 62 - R-package/man/cb.evaluation.log.Rd | 31 - R-package/man/cb.print.evaluation.Rd | 29 - R-package/man/cb.save.model.Rd | 40 - R-package/man/xgb.Callback.Rd | 248 +++ ...{cb.cv.predict.Rd => xgb.cb.cv.predict.Rd} | 43 +- R-package/man/xgb.cb.early.stop.Rd | 55 + R-package/man/xgb.cb.evaluation.log.Rd | 24 + ....history.Rd => xgb.cb.gblinear.history.Rd} | 55 +- R-package/man/xgb.cb.print.evaluation.Rd | 25 + ...rameters.Rd => xgb.cb.reset.parameters.Rd} | 25 +- R-package/man/xgb.cb.save.model.Rd | 28 + R-package/man/xgb.cv.Rd | 26 +- R-package/man/xgb.gblinear.history.Rd | 24 +- R-package/man/xgb.load.Rd | 2 +- R-package/man/xgb.train.Rd | 34 +- R-package/src/init.c | 2 + R-package/src/xgboost_R.cc | 15 + R-package/src/xgboost_R.h | 10 + R-package/tests/testthat.R | 1 + R-package/tests/testthat/test_basic.R | 1 - R-package/tests/testthat/test_callbacks.R | 281 ++- R-package/tests/testthat/test_glm.R | 4 +- 32 files changed, 2076 insertions(+), 1339 deletions(-) delete mode 100644 R-package/man/callbacks.Rd delete mode 100644 R-package/man/cb.early.stop.Rd delete mode 100644 R-package/man/cb.evaluation.log.Rd delete mode 100644 R-package/man/cb.print.evaluation.Rd delete mode 100644 R-package/man/cb.save.model.Rd create mode 100644 R-package/man/xgb.Callback.Rd rename R-package/man/{cb.cv.predict.Rd => xgb.cb.cv.predict.Rd} (53%) create mode 100644 R-package/man/xgb.cb.early.stop.Rd create mode 100644 R-package/man/xgb.cb.evaluation.log.Rd rename R-package/man/{cb.gblinear.history.Rd => xgb.cb.gblinear.history.Rd} (63%) create mode 100644 R-package/man/xgb.cb.print.evaluation.Rd rename R-package/man/{cb.reset.parameters.Rd => xgb.cb.reset.parameters.Rd} (57%) create mode 100644 R-package/man/xgb.cb.save.model.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 580d1f873..c9e085e77 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -20,15 +20,9 @@ export("xgb.attr<-") export("xgb.attributes<-") export("xgb.config<-") export("xgb.parameters<-") -export(cb.cv.predict) -export(cb.early.stop) -export(cb.evaluation.log) -export(cb.gblinear.history) -export(cb.print.evaluation) -export(cb.reset.parameters) -export(cb.save.model) export(getinfo) export(setinfo) +export(xgb.Callback) export(xgb.DMatrix) export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.save) @@ -39,6 +33,13 @@ export(xgb.QuantileDMatrix) export(xgb.QuantileDMatrix.from_iterator) export(xgb.attr) export(xgb.attributes) +export(xgb.cb.cv.predict) +export(xgb.cb.early.stop) +export(xgb.cb.evaluation.log) +export(xgb.cb.gblinear.history) +export(xgb.cb.print.evaluation) +export(xgb.cb.reset.parameters) +export(xgb.cb.save.model) export(xgb.config) export(xgb.copy.Booster) export(xgb.create.features) @@ -72,14 +73,10 @@ export(xgb.slice.DMatrix) export(xgb.train) export(xgboost) import(methods) +importClassesFrom(Matrix,CsparseMatrix) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgRMatrix) -importClassesFrom(Matrix,dgeMatrix) -importFrom(Matrix,colSums) 
importFrom(Matrix,sparse.model.matrix) -importFrom(Matrix,sparseMatrix) -importFrom(Matrix,sparseVector) -importFrom(Matrix,t) importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,data.table) @@ -101,6 +98,7 @@ importFrom(methods,new) importFrom(stats,coef) importFrom(stats,median) importFrom(stats,predict) +importFrom(stats,sd) importFrom(stats,variable.names) importFrom(utils,head) importFrom(utils,object.size) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 02e0a7cd4..d768e1b9e 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -1,769 +1,392 @@ -#' Callback closures for booster training. -#' -#' These are used to perform various service tasks either during boosting iterations or at the end. -#' This approach helps to modularize many of such tasks without bloating the main training methods, -#' and it offers . -#' -#' @details -#' By default, a callback function is run after each boosting iteration. -#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function. -#' -#' When a callback function has \code{finalize} parameter, its finalizer part will also be run after -#' the boosting is completed. -#' -#' WARNING: side-effects!!! Be aware that these callback functions access and modify things in -#' the environment from which they are called from, which is a fairly uncommon thing to do in R. -#' -#' To write a custom callback closure, make sure you first understand the main concepts about R environments. -#' Check either R documentation on \code{\link[base]{environment}} or the -#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R" -#' book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks - -#' choose ones that do something similar to what you want to achieve. Also, you would need to get familiar -#' with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments. -#' -#' @seealso -#' \code{\link{cb.print.evaluation}}, -#' \code{\link{cb.evaluation.log}}, -#' \code{\link{cb.reset.parameters}}, -#' \code{\link{cb.early.stop}}, -#' \code{\link{cb.save.model}}, -#' \code{\link{cb.cv.predict}}, -#' \code{\link{xgb.train}}, -#' \code{\link{xgb.cv}} -#' -#' @name callbacks -NULL +.reserved_cb_names <- c("names", "class", "call", "params", "niter", "nfeatures", "folds") -# -# Callbacks ------------------------------------------------------------------- -# - -#' Callback closure for printing the result of evaluation +#' @title XGBoost Callback Constructor +#' @description Constructor for defining the structure of callback functions that can be executed +#' at different stages of model training (before / after training, before / after each boosting +#' iteration). +#' @param cb_name Name for the callback. #' -#' @param period results would be printed every number of periods -#' @param showsd whether standard deviations should be printed (when available) +#' If the callback produces some non-NULL result (from executing the function passed under +#' `f_after_training`), that result will be added as an R attribute to the resulting booster +#' (or as a named element in the result of CV), with the attribute name specified here. #' -#' @details -#' The callback function prints the result of evaluation at every \code{period} iterations. -#' The initial and the last iteration's evaluations are always printed. +#' Names of callbacks must be unique - i.e. 
there cannot be two callbacks with the same name.
+#' @param env An environment object that will be passed to the different functions in the callback.
+#' Note that this environment will not be shared with other callbacks.
+#' @param f_before_training A function that will be executed before the training has started.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available),
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
+#' If passing `NULL` for this or for the other function inputs, then no function will be executed.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' If passing a function, it will be called with parameters supplied as non-named arguments
+#' matching the function signatures that are shown in the default value for each function argument.
+#' @param f_before_iter A function that will be executed before each boosting round.
 #'
-#' @export
-cb.print.evaluation <- function(period = 1, showsd = TRUE) {
-
-  callback <- function(env = parent.frame()) {
-    if (length(env$bst_evaluation) == 0 ||
-        period == 0 ||
-        NVL(env$rank, 0) != 0)
-      return()
-
-    i <- env$iteration
-    if ((i - 1) %% period == 0 ||
-        i == env$begin_iteration ||
-        i == env$end_iteration) {
-      stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
-      cat(msg, '\n')
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.print.evaluation'
-  callback
-}
-
-
-#' Callback closure for logging the evaluation history
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped before the current iteration happens.
 #'
-#' @details
-#' This callback function appends the current iteration evaluation results \code{bst_evaluation}
-#' available in the calling parent frame to the \code{evaluation_log} list in a calling frame.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_iter A function that will be executed after each boosting round.
 #'
-#' The finalizer callback (called with \code{finalize = TURE} in the end) converts
-#' the \code{evaluation_log} list into a final data.table.
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped at that round.
 #'
-#' The iteration evaluation result \code{bst_evaluation} must be a named numeric vector.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_training A function that will be executed after training is finished.
 #'
-#' Note: in the column names of the final data.table, the dash '-' character is replaced with
-#' the underscore '_' in order to make the column names more like regular R identifiers.
+#' This function can optionally output something non-NULL, which will become part of the R
+#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to \link{xgb.train})
+#' under the name supplied for parameter `cb_name` in the case of \link{xgb.train}; or as part
+#' of the named elements in the result of \link{xgb.cv}.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details Arguments that will be passed to the supplied functions are as follows:\itemize{ #' -#' Callback function expects the following values to be set in its calling frame: -#' \code{evaluation_log}, -#' \code{bst_evaluation}, -#' \code{iteration}. +#' \item env The same environment that is passed under argument `env`. #' -#' @seealso -#' \code{\link{callbacks}} +#' It may be modified by the functions in order to e.g. keep tracking of what happens +#' across iterations or similar. #' -#' @export -cb.evaluation.log <- function() { - - mnames <- NULL - - init <- function(env) { - if (!is.list(env$evaluation_log)) - stop("'evaluation_log' has to be a list") - mnames <<- names(env$bst_evaluation) - if (is.null(mnames) || any(mnames == "")) - stop("bst_evaluation must have non-empty names") - - mnames <<- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE) - if (!is.null(env$bst_evaluation_err)) - mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std')) - } - - finalizer <- function(env) { - env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log))) - setnames(env$evaluation_log, c('iter', mnames)) - - if (!is.null(env$bst_evaluation_err)) { - # rearrange col order from _mean,_mean,...,_std,_std,... - # to be _mean,_std,_mean,_std,... - len <- length(mnames) - means <- mnames[seq_len(len / 2)] - stds <- mnames[(len / 2 + 1):len] - cnames <- numeric(len) - cnames[c(TRUE, FALSE)] <- means - cnames[c(FALSE, TRUE)] <- stds - env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with = FALSE] - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (is.null(mnames)) - init(env) - - if (finalize) - return(finalizer(env)) - - ev <- env$bst_evaluation - if (!is.null(env$bst_evaluation_err)) - ev <- c(ev, env$bst_evaluation_err) - env$evaluation_log <- c(env$evaluation_log, - list(c(iter = env$iteration, ev))) - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.evaluation.log' - callback -} - -#' Callback closure for resetting the booster's parameters at each iteration. +#' This environment is only used by the functions supplied to the callback, and will +#' not be kept after the model fitting function terminates (see parameter `f_after_training`). #' -#' @param new_params a list where each element corresponds to a parameter that needs to be reset. -#' Each element's value must be either a vector of values of length \code{nrounds} -#' to be set at each iteration, -#' or a function of two parameters \code{learning_rates(iteration, nrounds)} -#' which returns a new parameter value by using the current iteration number -#' and the total number of boosting rounds. +#' \item model The booster object when using \link{xgb.train}, or the folds when using +#' \link{xgb.cv}. #' -#' @details -#' This is a "pre-iteration" callback function used to reset booster's parameters -#' at the beginning of each iteration. -#' -#' Note that when training is resumed from some previous model, and a function is used to -#' reset a parameter value, the \code{nrounds} argument in this function would be the -#' the number of boosting rounds in the current training. -#' -#' Callback function expects the following values to be set in its calling frame: -#' \code{bst} or \code{bst_folds}, -#' \code{iteration}, -#' \code{begin_iteration}, -#' \code{end_iteration}. 
-#'
-#' @seealso
-#' \code{\link{callbacks}}
-#'
-#' @export
-cb.reset.parameters <- function(new_params) {
-
-  if (typeof(new_params) != "list")
-    stop("'new_params' must be a list")
-  pnames <- gsub(".", "_", names(new_params), fixed = TRUE)
-  nrounds <- NULL
-
-  # run some checks in the beginning
-  init <- function(env) {
-    nrounds <<- env$end_iteration - env$begin_iteration + 1
-
-    if (is.null(env$bst) && is.null(env$bst_folds))
-      stop("Parent frame has neither 'bst' nor 'bst_folds'")
-
-    # Some parameters are not allowed to be changed,
-    # since changing them would simply wreck some chaos
-    not_allowed <- pnames %in%
-      c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
-    if (any(not_allowed))
-      stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.")
-
-    for (n in pnames) {
-      p <- new_params[[n]]
-      if (is.function(p)) {
-        if (length(formals(p)) != 2)
-          stop("Parameter '", n, "' is a function but not of two arguments")
-      } else if (is.numeric(p) || is.character(p)) {
-        if (length(p) != nrounds)
-          stop("Length of '", n, "' has to be equal to 'nrounds'")
-      } else {
-        stop("Parameter '", n, "' is not a function or a vector")
-      }
-    }
-  }
-
-  callback <- function(env = parent.frame()) {
-    if (is.null(nrounds))
-      init(env)
-
-    i <- env$iteration
-    pars <- lapply(new_params, function(p) {
-      if (is.function(p))
-        return(p(i, nrounds))
-      p[i]
-    })
-
-    if (!is.null(env$bst)) {
-      xgb.parameters(env$bst) <- pars
-    } else {
-      for (fd in env$bst_folds)
-        xgb.parameters(fd$bst) <- pars
-    }
-  }
-  attr(callback, 'is_pre_iteration') <- TRUE
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.reset.parameters'
-  callback
-}
-
-
-#' Callback closure to activate the early stopping.
-#'
-#' @param stopping_rounds The number of rounds with no improvement in
-#' the evaluation metric in order to stop the training.
-#' @param maximize whether to maximize the evaluation metric
-#' @param metric_name the name of an evaluation column to use as a criteria for early
-#' stopping. If not set, the last column would be used.
-#' Let's say the test data in \code{watchlist} was labelled as \code{dtest},
-#' and one wants to use the AUC in test data for early stopping regardless of where
-#' it is in the \code{watchlist}, then one of the following would need to be set:
-#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
-#' All dash '-' characters in metric names are considered equivalent to '_'.
-#' @param verbose whether to print the early stopping information.
-#'
-#' @details
-#' This callback function determines the condition for early stopping
-#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
-#'
-#' The following additional fields are assigned to the model's R object:
-#' \itemize{
-#' \item \code{best_score} the evaluation score at the best iteration
-#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-#' }
-#' The Same values are also stored as xgb-attributes:
-#' \itemize{
-#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
-#' \item \code{best_msg} message string is also stored.
+#' \item model The booster object when using \link{xgb.train}, or the folds when using
+#' \link{xgb.cv}.
 #'
+#' For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
+#' \item `dtrain`: The training data for the fold (as an `xgb.DMatrix` object).
+#' \item `bst`: The `xgb.Booster` object for the fold.
+#' \item `watchlist`: A list with two DMatrices, with names `train` and `test` +#' (`test` is the held-out data for the fold). +#' \item `index`: The indices of the hold-out data for that fold (base-1 indexing), +#' from which the `test` entry in the watchlist was obtained. #' } #' -#' At least one data element is required in the evaluation watchlist for early stopping to work. +#' This object should \bold{not} be in-place modified in ways that conflict with the +#' training (e.g. resetting the parameters for a training update in a way that resets +#' the number of rounds to zero in order to overwrite rounds). #' -#' Callback function expects the following values to be set in its calling frame: -#' \code{stop_condition}, -#' \code{bst_evaluation}, -#' \code{rank}, -#' \code{bst} (or \code{bst_folds} and \code{basket}), -#' \code{iteration}, -#' \code{begin_iteration}, -#' \code{end_iteration}, +#' Note that any R attributes that are assigned to the booster during the callback functions, +#' will not be kept thereafter as the booster object variable is not re-assigned during +#' training. It is however possible to set C-level attributes of the booster through +#' \link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest +#' of the iterations and after the training is done. #' -#' @seealso -#' \code{\link{callbacks}}, -#' \code{\link{xgb.attr}} +#' For keeping variables across iterations, it's recommended to use `env` instead. +#' \item data The data to which the model is being fit, as an `xgb.DMatrix` object. #' -#' @export -cb.early.stop <- function(stopping_rounds, maximize = FALSE, - metric_name = NULL, verbose = TRUE) { - # state variables - best_iteration <- -1 - best_score <- Inf - best_msg <- NULL - metric_idx <- 1 - - init <- function(env) { - if (length(env$bst_evaluation) == 0) - stop("For early stopping, watchlist must have at least one element") - - eval_names <- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE) - if (!is.null(metric_name)) { - metric_idx <<- which(gsub('-', '_', metric_name, fixed = TRUE) == eval_names) - if (length(metric_idx) == 0) - stop("'metric_name' for early stopping is not one of the following:\n", - paste(eval_names, collapse = ' '), '\n') - } - if (is.null(metric_name) && - length(env$bst_evaluation) > 1) { - metric_idx <<- length(eval_names) - if (verbose) - cat('Multiple eval metrics are present. 
Will use ', - eval_names[metric_idx], ' for early stopping.\n', sep = '') - } - - metric_name <<- eval_names[metric_idx] - - # maximize is usually NULL when not set in xgb.train and built-in metrics - if (is.null(maximize)) - maximize <<- grepl('(_auc|_map|_ndcg|_pre)', metric_name) - - if (verbose && NVL(env$rank, 0) == 0) - cat("Will train until ", metric_name, " hasn't improved in ", - stopping_rounds, " rounds.\n\n", sep = '') - - best_iteration <<- 1 - if (maximize) best_score <<- -Inf - - env$stop_condition <- FALSE - - if (!is.null(env$bst)) { - if (!inherits(env$bst, 'xgb.Booster')) - stop("'bst' in the parent frame must be an 'xgb.Booster'") - if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) { - best_score <<- as.numeric(best_score) - best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1 - best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg')) - } else { - xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1, - best_score = best_score) - } - } else if (is.null(env$bst_folds) || is.null(env$basket)) { - stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')") - } - } - - finalizer <- function(env) { - if (!is.null(env$bst)) { - attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score')) - if (best_score != attr_best_score) { - # If the difference is too big, throw an error - if (abs(best_score - attr_best_score) >= 1e-14) { - stop("Inconsistent 'best_score' values between the closure state: ", best_score, - " and the xgb.attr: ", attr_best_score) - } - # If the difference is due to floating-point truncation, update best_score - best_score <- attr_best_score - } - xgb.attr(env$bst, "best_iteration") <- best_iteration - 1 - xgb.attr(env$bst, "best_score") <- best_score - } else { - env$basket$best_iteration <- best_iteration - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (best_iteration < 0) - init(env) - - if (finalize) - return(finalizer(env)) - - i <- env$iteration - score <- env$bst_evaluation[metric_idx] - - if ((maximize && score > best_score) || - (!maximize && score < best_score)) { - - best_msg <<- .format_eval_string( - i, env$bst_evaluation, env$bst_evaluation_err - ) - best_score <<- score - best_iteration <<- i - # save the property to attributes, so they will occur in checkpoint - if (!is.null(env$bst)) { - xgb.attributes(env$bst) <- list( - best_iteration = best_iteration - 1, # convert to 0-based index - best_score = best_score, - best_msg = best_msg - ) - } - } else if (i - best_iteration >= stopping_rounds) { - env$stop_condition <- TRUE - env$end_iteration <- i - if (verbose && NVL(env$rank, 0) == 0) - cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '') - } - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.early.stop' - callback -} - - -#' Callback closure for saving a model file. +#' Note that, for \link{xgb.cv}, this will be the full data, while data for the specific +#' folds can be found in the `model` object. #' -#' @param save_period save the model to disk after every -#' \code{save_period} iterations; 0 means save the model at the end. -#' @param save_name the name or path for the saved model file. +#' \item watchlist The evaluation watchlist, as passed under argument `watchlist` to +#' \link{xgb.train}. #' -#' Note that the format of the model being saved is determined by the file -#' extension specified here (see \link{xgb.save} for details about how it works). +#' For \link{xgb.cv}, this will always be `NULL`. 
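A minimal sketch of what the `model` argument described above implies for custom callbacks that need to work under both `xgb.train()` and `xgb.cv()`: the body can branch on the object's class, as the built-in callbacks in this patch do. The `count_rounds` name is hypothetical, and `xgb.get.num.boosted.rounds()` is assumed to be available in this version of the package:

```r
library(xgboost)

# Hypothetical 'f_after_iter' body: normalize 'model' into a list of boosters,
# whether it came from xgb.train() (a single xgb.Booster) or from xgb.cv()
# (a list of folds, each carrying its booster under 'bst').
count_rounds <- function(env, model, data, watchlist, iteration, iter_feval) {
  if (inherits(model, "xgb.Booster")) {
    boosters <- list(model)
  } else {
    boosters <- lapply(model, function(fd) fd$bst)
  }
  # Keep per-booster round counts in the callback's private environment
  # (assumption: xgb.get.num.boosted.rounds() exists at this point in history)
  env$n_rounds <- sapply(boosters, xgb.get.num.boosted.rounds)
  return(FALSE)  # never request early termination
}
```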
 #'
-#' It can contain a \code{\link[base]{sprintf}} formatting specifier
-#' to include the integer iteration number in the file name.
-#' E.g., with \code{save_name} = 'xgboost_%04d.ubj',
-#' the file saved at iteration 50 would be named "xgboost_0050.ubj".
-#' @seealso \link{xgb.save}
-#' @details
-#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
+#' \item begin_iteration Index of the first boosting iteration that will be executed
+#' (base-1 indexing).
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst},
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
+#' This will typically be '1', but when using training continuation, depending on the
+#' parameters for updates, boosting rounds will be continued from where the previous
+#' model ended, in which case this will be larger than 1.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' \item end_iteration Index of the last boosting iteration that will be executed
+#' (base-1 indexing, inclusive of this end).
 #'
-#' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
-
-  if (save_period < 0)
-    stop("'save_period' cannot be negative")
-
-  callback <- function(env = parent.frame()) {
-    if (is.null(env$bst))
-      stop("'save_model' callback requires the 'bst' booster object in its calling frame")
-
-    if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
-        (save_period == 0 && env$iteration == env$end_iteration)) {
-      # Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
-      suppressWarnings({
-        save_name <- sprintf(save_name, env$iteration)
-      })
-      xgb.save(env$bst, save_name)
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.save.model'
-  callback
-}
-
-
-#' Callback closure for returning cross-validation based predictions.
+#' It should match the `nrounds` argument passed to \link{xgb.train} or \link{xgb.cv}.
 #'
-#' @param save_models a flag for whether to save the folds' models.
+#' Note that boosting might be interrupted before reaching this last iteration, for
+#' example by using the early stopping callback \link{xgb.cb.early.stop}.
 #'
-#' @details
-#' This callback function saves predictions for all of the test folds,
-#' and also allows to save the folds' models.
+#' \item iteration Index of the iteration number that is being executed (the first iteration
+#' will be the same as parameter `begin_iteration`, the next one will add +1, and so on).
 #'
-#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
-#' thus it must be run after the early stopping callback if the early stopping is used.
+#' \item iter_feval Evaluation metrics for the `watchlist` that was supplied, either
+#' determined by the objective, or by parameter `feval`.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst_folds},
-#' \code{basket},
-#' \code{data},
-#' \code{end_iteration},
-#' \code{params},
+#' For \link{xgb.train}, this will be a named vector with one entry per element in
+#' `watchlist`, where the names are determined as 'watchlist name' + '-' + 'metric name' - for
+#' example, if `watchlist` contains an entry named "tr" and the metric is "rmse",
+#' this will be a one-element vector with name "tr-rmse".
#' -#' @return -#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix, -#' depending on the number of prediction outputs per data row. The order of predictions corresponds -#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is -#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a -#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be -#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits. -#' When some of the indices in the training dataset are not included into user-provided \code{folds}, -#' their prediction value would be \code{NA}. +#' For \link{xgb.cv}, this will be a 2d matrix with dimensions `[length(watchlist), nfolds]`, +#' where the row names will follow the same naming logic as the one-dimensional vector +#' that is passed in \link{xgb.train}. #' -#' @seealso -#' \code{\link{callbacks}} +#' Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize +#' this table by calculating the row-wise means and standard deviations. #' -#' @export -cb.cv.predict <- function(save_models = FALSE) { - - finalizer <- function(env) { - if (is.null(env$basket) || is.null(env$bst_folds)) - stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame") - - N <- nrow(env$data) - pred <- NULL - - iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration)) - if (NVL(env$params[['booster']], '') == 'gblinear') { - iterationrange <- "all" - } - for (fd in env$bst_folds) { - pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE) - if (is.null(pred)) { - if (NCOL(pr) > 1L) { - pred <- matrix(NA_real_, N, ncol(pr)) - } else { - pred <- matrix(NA_real_, N) - } - } - if (is.matrix(pred)) { - pred[fd$index, ] <- pr - } else { - pred[fd$index] <- pr - } - } - env$basket$pred <- pred - if (save_models) { - env$basket$models <- lapply(env$bst_folds, function(fd) { - return(fd$bst) - }) - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (finalize) - return(finalizer(env)) - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.cv.predict' - callback -} - - -#' Callback closure for collecting the model coefficients history of a gblinear booster -#' during its training. +#' \item final_feval The evaluation results after the last boosting round is executed +#' (same format as `iter_feval`, and will be the exact same input as passed under +#' `iter_feval` to the last round that is executed during model fitting). #' -#' @param sparse when set to FALSE/TRUE, a dense/sparse matrix is used to store the result. -#' Sparse format is useful when one expects only a subset of coefficients to be non-zero, -#' when using the "thrifty" feature selector with fairly small number of top features -#' selected per iteration. +#' \item prev_cb_res Result from a previous run of a callback sharing the same name +#' (as given by parameter `cb_name`) when conducting training continuation, if there +#' was any in the booster R attributes. #' -#' @details -#' To keep things fast and simple, gblinear booster does not internally store the history of linear -#' model coefficients at each boosting iteration. This callback provides a workaround for storing -#' the coefficients' path, by extracting them after each training iteration. 
+#' Sometimes, one might want to append the new results to the previous ones, and this will
+#' be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
+#' which will append the new rows to the previous table.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst} (or \code{bst_folds}).
+#' If no such previous callback result is available (which will never be the case when
+#' fitting a model from scratch instead of updating an existing model), this will be `NULL`.
 #'
-#' @return
-#' Results are stored in the \code{coefs} element of the closure.
-#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy
-#' way to access it.
-#' With \code{xgb.train}, it is either a dense of a sparse matrix.
-#' While with \code{xgb.cv}, it is a list (an element per each fold) of such
-#' matrices.
+#' For \link{xgb.cv}, which doesn't support training continuation, this will always be `NULL`.
+#' }
 #'
-#' @seealso
-#' \code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+#' The following names (`cb_name` values) are reserved for internal callbacks:\itemize{
+#' \item print_evaluation
+#' \item evaluation_log
+#' \item reset_parameters
+#' \item early_stop
+#' \item save_model
+#' \item cv_predict
+#' \item gblinear_history
+#' }
 #'
+#' The following names are reserved for other non-callback attributes:\itemize{
+#' \item names
+#' \item class
+#' \item call
+#' \item params
+#' \item niter
+#' \item nfeatures
+#' \item folds
+#' }
+#'
+#' When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback
+#' will always be executed before the others, as it sets some booster C-level attributes
+#' that other callbacks might also use. Otherwise, the order of execution will match
+#' the order in which the callbacks are passed to the model fitting function.
+#' @seealso Built-in callbacks:\itemize{
+#' \item \link{xgb.cb.print.evaluation}
+#' \item \link{xgb.cb.evaluation.log}
+#' \item \link{xgb.cb.reset.parameters}
+#' \item \link{xgb.cb.early.stop}
+#' \item \link{xgb.cb.save.model}
+#' \item \link{xgb.cb.cv.predict}
+#' \item \link{xgb.cb.gblinear.history}
+#' }
 #' @examples
-#' #### Binary classification:
+#' # Example constructing a custom callback that calculates
+#' # squared error on the training data, without a watchlist,
+#' # and outputs the per-iteration results.
+#' ssq_callback <- xgb.Callback(
+#'   cb_name = "ssq",
+#'   f_before_training = function(env, model, data, watchlist,
+#'                                begin_iteration, end_iteration) {
+#'     # A vector to keep track of a number at each iteration
+#'     env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
+#'   },
+#'   f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) {
+#'     # This calculates the sum of squared errors on the training data.
+#'     # Note that this can be better done by passing a 'watchlist' entry,
+#'     # but this demonstrates a way in which callbacks can be structured.
+#' pred <- predict(model, data) +#' err <- pred - getinfo(data, "label") +#' sq_err <- sum(err^2) +#' env$logs[iteration] <- sq_err +#' cat( +#' sprintf( +#' "Squared error at iteration %d: %.2f\n", +#' iteration, sq_err +#' ) +#' ) #' -#' ## Keep the number of threads to 1 for examples -#' nthread <- 1 -#' data.table::setDTthreads(nthread) +#' # A return value of 'TRUE' here would signal to finalize the training +#' return(FALSE) +#' }, +#' f_after_training = function(env, model, data, watchlist, iteration, +#' final_feval, prev_cb_res) { +#' return(env$logs) +#' } +#' ) #' -#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest -#' # without considering the 2nd order interactions: -#' x <- model.matrix(Species ~ .^2, iris)[,-1] -#' colnames(x) -#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) -#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) -#' # For 'shotgun', which is a default linear updater, using high eta values may result in -#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning -#' # rate does not break the convergence, but allows us to illustrate the typical pattern of -#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., -#' callbacks = list(cb.gblinear.history())) -#' # Extract the coefficients' path and plot them vs boosting iteration number: -#' coef_path <- xgb.gblinear.history(bst) -#' matplot(coef_path, type = 'l') -#' -#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates. -#' # Will try the classical componentwise boosting which selects a single best feature per round: -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, -#' updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, -#' callbacks = list(cb.gblinear.history())) -#' matplot(xgb.gblinear.history(bst), type = 'l') -#' # Componentwise boosting is known to have similar effect to Lasso regularization. -#' # Try experimenting with various values of top_k, eta, nrounds, -#' # as well as different feature_selectors. 
-#' -#' # For xgb.cv: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, -#' callbacks = list(cb.gblinear.history())) -#' # coefficients in the CV fold #3 -#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l') -#' -#' -#' #### Multiclass classification: -#' # -#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) -#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) -#' # For the default linear updater 'shotgun' it sometimes is helpful -#' # to use smaller eta to reduce instability -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, -#' callbacks = list(cb.gblinear.history())) -#' # Will plot the coefficient paths separately for each class: -#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') -#' -#' # CV: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, -#' callbacks = list(cb.gblinear.history(FALSE))) -#' # 1st fold of 1st class -#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') +#' data(mtcars) +#' y <- mtcars$mpg +#' x <- as.matrix(mtcars[, -1]) +#' dm <- xgb.DMatrix(x, label = y, nthread = 1) +#' model <- xgb.train( +#' data = dm, +#' params = list(objective = "reg:squarederror", nthread = 1), +#' nrounds = 5, +#' callbacks = list(ssq_callback), +#' keep_extra_attributes = TRUE +#' ) #' +#' # Result from 'f_after_iter' will be available as an attribute +#' attributes(model)$ssq #' @export -cb.gblinear.history <- function(sparse = FALSE) { - coefs <- NULL +xgb.Callback <- function( + cb_name = "custom_callback", + env = new.env(), + f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) NULL, + f_before_iter = function(env, model, data, watchlist, iteration) NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) NULL, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) NULL +) { + stopifnot(is.null(f_before_training) || is.function(f_before_training)) + stopifnot(is.null(f_before_iter) || is.function(f_before_iter)) + stopifnot(is.null(f_after_iter) || is.function(f_after_iter)) + stopifnot(is.null(f_after_training) || is.function(f_after_training)) + stopifnot(is.character(cb_name) && length(cb_name) == 1) - init <- function(env) { - # xgb.train(): bst will be present - # xgb.cv(): bst_folds will be present - if (is.null(env$bst) && is.null(env$bst_folds)) { - stop("Parent frame has neither 'bst' nor 'bst_folds'") - } + if (cb_name %in% .reserved_cb_names) { + stop("Cannot use reserved callback name '", cb_name, "'.") } - # convert from list to (sparse) matrix - list2mat <- function(coef_list) { - if (sparse) { - coef_mat <- sparseMatrix(x = unlist(lapply(coef_list, slot, "x")), - i = unlist(lapply(coef_list, slot, "i")), - p = c(0, cumsum(sapply(coef_list, function(x) length(x@x)))), - dims = c(length(coef_list[[1]]), length(coef_list))) - return(t(coef_mat)) - } else { - return(do.call(rbind, coef_list)) - } - } + out <- list( + cb_name = cb_name, + env = env, + f_before_training = f_before_training, + f_before_iter = f_before_iter, + f_after_iter = f_after_iter, + f_after_training = f_after_training + ) + class(out) <- "xgb.Callback" + return(out) +} - finalizer <- function(env) { - if (length(coefs) == 0) - return() - 
if (!is.null(env$bst)) { # # xgb.train: - coefs <<- list2mat(coefs) - } else { # xgb.cv: - # second lapply transposes the list - coefs <<- lapply( - X = lapply( - X = seq_along(coefs[[1]]), - FUN = function(i) lapply(coefs, "[[", i) - ), - FUN = list2mat +.execute.cb.before.training <- function( + callbacks, + model, + data, + watchlist, + begin_iteration, + end_iteration +) { + for (callback in callbacks) { + if (!is.null(callback$f_before_training)) { + callback$f_before_training( + callback$env, + model, + data, + watchlist, + begin_iteration, + end_iteration ) } } - - extract.coef <- function(env) { - if (!is.null(env$bst)) { # # xgb.train: - cf <- as.numeric(grep('(booster|bias|weigh)', xgb.dump(env$bst), invert = TRUE, value = TRUE)) - if (sparse) cf <- as(cf, "sparseVector") - } else { # xgb.cv: - cf <- vector("list", length(env$bst_folds)) - for (i in seq_along(env$bst_folds)) { - dmp <- xgb.dump(env$bst_folds[[i]]$bst) - cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE)) - if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector") - } - } - cf - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (is.null(coefs)) init(env) - if (finalize) return(finalizer(env)) - cf <- extract.coef(env) - coefs <<- c(coefs, list(cf)) - } - - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.gblinear.history' - callback } -#' @title Extract gblinear coefficients history. -#' @description A helper function to extract the matrix of linear coefficients' history -#' from a gblinear model created while using the \code{cb.gblinear.history()} -#' callback. -#' @details Note that this is an R-specific function that relies on R attributes that -#' are not saved when using xgboost's own serialization functions like \link{xgb.load} -#' or \link{xgb.load.raw}. -#' -#' In order for a serialized model to be accepted by tgis function, one must use R -#' serializers such as \link{saveRDS}. -#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster -#' loaded from \link{xgb.load} or \link{xgb.load.raw}. -#' @param class_index zero-based class index to extract the coefficients for only that -#' specific class in a multinomial multiclass model. When it is NULL, all the -#' coefficients are returned. Has no effect in non-multiclass models. -#' -#' @return -#' For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns -#' corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would -#' return) and the rows corresponding to boosting iterations. -#' -#' For an \code{xgb.cv} result, a list of such matrices is returned with the elements -#' corresponding to CV folds. 
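For orientation, the `.execute.cb.*` helpers introduced in this file (`.execute.cb.before.training` above, plus `.execute.cb.before.iter`, `.execute.cb.after.iter`, and `.execute.cb.after.training` below) are meant to be driven by the fitting functions. The following is a rough, simplified sketch of that dispatch pattern, with `run_boosting_loop` as a hypothetical name; it is not the actual `xgb.train()` internals:

```r
# Simplified driver loop: callbacks run once before training, around each
# boosting round (either per-round hook may request a stop by returning TRUE),
# and once after training.
run_boosting_loop <- function(callbacks, model, data, watchlist, nrounds) {
  .execute.cb.before.training(callbacks, model, data, watchlist, 1, nrounds)
  iter_feval <- NULL
  for (iteration in seq_len(nrounds)) {
    if (.execute.cb.before.iter(callbacks, model, data, watchlist, iteration)) break
    # ... one boosting update plus metric evaluation would happen here,
    # producing 'iter_feval' ...
    if (.execute.cb.after.iter(callbacks, model, data, watchlist,
                               iteration, iter_feval)) break
  }
  # Non-NULL results from 'f_after_training' hooks become R attributes of the
  # booster (or named elements of the CV result)
  .execute.cb.after.training(callbacks, model, data, watchlist,
                             iteration, iter_feval, NULL)
}
```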
-#' -#' @export -xgb.gblinear.history <- function(model, class_index = NULL) { - - if (!(inherits(model, "xgb.Booster") || - inherits(model, "xgb.cv.synchronous"))) - stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class") - is_cv <- inherits(model, "xgb.cv.synchronous") - - if (is_cv) { - callbacks <- model$callbacks - } else { - callbacks <- attributes(model)$callbacks +.execute.cb.before.iter <- function( + callbacks, + model, + data, + watchlist, + iteration +) { + if (!length(callbacks)) { + return(FALSE) } + out <- sapply(callbacks, function(cb) { + if (is.null(cb$f_before_iter)) { + return(FALSE) + } + should_stop <- cb$f_before_iter( + cb$env, + model, + data, + watchlist, + iteration + ) + if (!NROW(should_stop)) { + should_stop <- FALSE + } else if (NROW(should_stop) > 1) { + should_stop <- head(as.logical(should_stop), 1) + } + return(should_stop) + }) + return(any(out)) +} - if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history)) - stop("model must be trained while using the cb.gblinear.history() callback") - - if (!is_cv) { - num_class <- xgb.num_class(model) - num_feat <- xgb.num_feature(model) - } else { - # in case of CV, the object is expected to have this info - if (model$params$booster != "gblinear") - stop("It does not appear to be a gblinear model") - num_class <- NVL(model$params$num_class, 1) - num_feat <- model$nfeatures - if (is.null(num_feat)) - stop("This xgb.cv result does not have nfeatures info") +.execute.cb.after.iter <- function( + callbacks, + model, + data, + watchlist, + iteration, + iter_feval +) { + if (!length(callbacks)) { + return(FALSE) } + out <- sapply(callbacks, function(cb) { + if (is.null(cb$f_after_iter)) { + return(FALSE) + } + should_stop <- cb$f_after_iter( + cb$env, + model, + data, + watchlist, + iteration, + iter_feval + ) + if (!NROW(should_stop)) { + should_stop <- FALSE + } else if (NROW(should_stop) > 1) { + should_stop <- head(as.logical(should_stop), 1) + } + return(should_stop) + }) + return(any(out)) +} - if (!is.null(class_index) && - num_class > 1 && - (class_index[1] < 0 || class_index[1] >= num_class)) - stop("class_index has to be within [0,", num_class - 1, "]") - - coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]] - if (!is.null(class_index) && num_class > 1) { - coef_path <- if (is.list(coef_path)) { - lapply(coef_path, - function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)]) +.execute.cb.after.training <- function( + callbacks, + model, + data, + watchlist, + iteration, + final_feval, + prev_cb_res +) { + if (!length(callbacks)) { + return(NULL) + } + old_cb_res <- attributes(model) + out <- lapply(callbacks, function(cb) { + if (is.null(cb$f_after_training)) { + return(NULL) } else { - coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)] + return( + cb$f_after_training( + cb$env, + model, + data, + watchlist, + iteration, + final_feval, + getElement(old_cb_res, cb$cb_name) + ) + ) } + }) + names(out) <- sapply(callbacks, function(cb) cb$cb_name) + if (NROW(out)) { + out <- out[!sapply(out, is.null)] } - coef_path + return(out) } +.summarize.feval <- function(iter_feval, showsd) { + if (NCOL(iter_feval) > 1L && showsd) { + stdev <- apply(iter_feval, 1, sd) + } else { + stdev <- NULL + } + if (NCOL(iter_feval) > 1L) { + iter_feval <- rowMeans(iter_feval) + } + return(list(feval = iter_feval, stdev = stdev)) +} -# -# Internal utility functions for callbacks ------------------------------------ -# +.print.evaluation 
<- function(iter_feval, showsd, iteration) { + tmp <- .summarize.feval(iter_feval, showsd) + msg <- .format_eval_string(iteration, tmp$feval, tmp$stdev) + cat(msg, '\n') +} # Format the evaluation metric string .format_eval_string <- function(iter, eval_res, eval_err = NULL) { @@ -784,69 +407,838 @@ xgb.gblinear.history <- function(model, class_index = NULL) { return(paste0(iter, res)) } -# Extract callback names from the list of callbacks -callback.names <- function(cb_list) { - unlist(lapply(cb_list, function(x) attr(x, 'name'))) -} - -# Extract callback calls from the list of callbacks -callback.calls <- function(cb_list) { - unlist(lapply(cb_list, function(x) attr(x, 'call'))) -} - -# Add a callback cb to the list and make sure that -# cb.early.stop and cb.cv.predict are at the end of the list -# with cb.cv.predict being the last (when present) -add.cb <- function(cb_list, cb) { - cb_list <- c(cb_list, cb) - names(cb_list) <- callback.names(cb_list) - if ('cb.early.stop' %in% names(cb_list)) { - cb_list <- c(cb_list, cb_list['cb.early.stop']) - # this removes only the first one - cb_list['cb.early.stop'] <- NULL +#' @title Callback for printing the result of evaluation +#' @param period results would be printed every number of periods +#' @param showsd whether standard deviations should be printed (when available) +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @description +#' The callback function prints the result of evaluation at every \code{period} iterations. +#' The initial and the last iteration's evaluations are always printed. +#' +#' Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that). +#' @seealso \link{xgb.Callback} +#' @export +xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) { + if (length(period) != 1 || period != floor(period) || period < 1) { + stop("'period' must be a positive integer.") } - if ('cb.cv.predict' %in% names(cb_list)) { - cb_list <- c(cb_list, cb_list['cb.cv.predict']) - cb_list['cb.cv.predict'] <- NULL - } - cb_list -} -# Sort callbacks list into categories -categorize.callbacks <- function(cb_list) { - list( - pre_iter = Filter(function(x) { - pre <- attr(x, 'is_pre_iteration') - !is.null(pre) && pre - }, cb_list), - post_iter = Filter(function(x) { - pre <- attr(x, 'is_pre_iteration') - is.null(pre) || !pre - }, cb_list), - finalize = Filter(function(x) { - 'finalize' %in% names(formals(x)) - }, cb_list) + xgb.Callback( + cb_name = "print_evaluation", + env = as.environment(list(period = period, showsd = showsd, is_first_call = TRUE)), + f_before_training = NULL, + f_before_iter = NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) { + if (is.null(iter_feval)) { + return(FALSE) + } + if (env$is_first_call || (iteration - 1) %% env$period == 0) { + .print.evaluation(iter_feval, env$showsd, iteration) + env$last_printed_iter <- iteration + } + env$is_first_call <- FALSE + return(FALSE) + }, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) { + if (is.null(final_feval)) { + return(NULL) + } + if (is.null(env$last_printed_iter) || iteration > env$last_printed_iter) { + .print.evaluation(final_feval, env$showsd, iteration) + } + } ) } -# Check whether all callback functions with names given by 'query_names' are present in the 'cb_list'. 
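A brief usage sketch for the `xgb.cb.print.evaluation()` constructor defined above; the dataset and parameter choices are illustrative only, and `verbose = 0` is used on the assumption that it keeps the fitting function from adding its own default print callback:

```r
library(xgboost)

data(mtcars)
dtrain <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)

# Prints train-rmse on the first round, every 5th round thereafter,
# and on the final round.
model <- xgb.train(
  data = dtrain,
  params = list(objective = "reg:squarederror", nthread = 1),
  nrounds = 20,
  watchlist = list(train = dtrain),
  verbose = 0,  # assumed to suppress the automatically-added print callback
  callbacks = list(xgb.cb.print.evaluation(period = 5))
)
```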
-has.callbacks <- function(cb_list, query_names) { - if (length(cb_list) < length(query_names)) - return(FALSE) - if (!is.list(cb_list) || - any(sapply(cb_list, class) != 'function')) { - stop('`cb_list` must be a list of callback functions') - } - cb_names <- callback.names(cb_list) - if (!is.character(cb_names) || - length(cb_names) != length(cb_list) || - any(cb_names == "")) { - stop('All callbacks in the `cb_list` must have a non-empty `name` attribute') - } - if (!is.character(query_names) || - length(query_names) == 0 || - any(query_names == "")) { - stop('query_names must be a non-empty vector of non-empty character names') - } - return(all(query_names %in% cb_names)) +#' @title Callback for logging the evaluation history +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @details This callback creates a table with per-iteration evaluation metrics (see parameters +#' `watchlist` and `feval` in \link{xgb.train}). +#' @details +#' Note: in the column names of the final data.table, the dash '-' character is replaced with +#' the underscore '_' in order to make the column names more like regular R identifiers. +#' @seealso \link{xgb.cb.print.evaluation} +#' @export +xgb.cb.evaluation.log <- function() { + xgb.Callback( + cb_name = "evaluation_log", + f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) { + env$evaluation_log <- vector("list", end_iteration - begin_iteration + 1) + env$next_log <- 1 + }, + f_before_iter = NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) { + tmp <- .summarize.feval(iter_feval, TRUE) + env$evaluation_log[[env$next_log]] <- list(iter = iteration, metrics = tmp$feval, sds = tmp$stdev) + env$next_log <- env$next_log + 1 + return(FALSE) + }, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) { + if (!NROW(env$evaluation_log)) { + return(prev_cb_res) + } + # in case of early stopping + if (env$next_log <= length(env$evaluation_log)) { + env$evaluation_log <- head(env$evaluation_log, env$next_log - 1) + } + + iters <- data.frame(iter = sapply(env$evaluation_log, function(x) x$iter)) + metrics <- do.call(rbind, lapply(env$evaluation_log, function(x) x$metrics)) + mnames <- gsub("-", "_", names(env$evaluation_log[[1]]$metrics), fixed = TRUE) + colnames(metrics) <- mnames + has_sds <- !is.null(env$evaluation_log[[1]]$sds) + if (has_sds) { + sds <- do.call(rbind, lapply(env$evaluation_log, function(x) x$sds)) + colnames(sds) <- mnames + metrics <- lapply( + mnames, + function(metric) { + out <- cbind(metrics[, metric], sds[, metric]) + colnames(out) <- paste0(metric, c("_mean", "_std")) + return(out) + } + ) + metrics <- do.call(cbind, metrics) + } + evaluation_log <- cbind(iters, metrics) + + if (!is.null(prev_cb_res)) { + if (!is.data.table(prev_cb_res)) { + prev_cb_res <- data.table::as.data.table(prev_cb_res) + } + prev_take <- prev_cb_res[prev_cb_res$iter < min(evaluation_log$iter)] + if (nrow(prev_take)) { + evaluation_log <- rbind(prev_cb_res, evaluation_log) + } + } + evaluation_log <- data.table::as.data.table(evaluation_log) + return(evaluation_log) + } + ) +} + +#' @title Callback for resetting the booster's parameters at each iteration. +#' @param new_params a list where each element corresponds to a parameter that needs to be reset. 
+#' Each element's value must be either a vector of values of length \code{nrounds}
+#' to be set at each iteration,
+#' or a function of two parameters \code{learning_rates(iteration, nrounds)}
+#' which returns a new parameter value by using the current iteration number
+#' and the total number of boosting rounds.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details
+#' Note that when training is resumed from some previous model, and a function is used to
+#' reset a parameter value, the \code{nrounds} argument in this function would be the
+#' number of boosting rounds in the current training.
+#'
+#' Does not leave any attribute in the booster.
+#' @export
+xgb.cb.reset.parameters <- function(new_params) {
+  stopifnot(is.list(new_params))
+  pnames <- gsub(".", "_", names(new_params), fixed = TRUE)
+  not_allowed <- pnames %in%
+    c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
+  if (any(not_allowed))
+    stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.")
+
+  xgb.Callback(
+    cb_name = "reset_parameters",
+    env = as.environment(list(new_params = new_params)),
+    f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) {
+      env$end_iteration <- end_iteration
+
+      pnames <- gsub(".", "_", names(env$new_params), fixed = TRUE)
+      for (n in pnames) {
+        p <- env$new_params[[n]]
+        if (is.function(p)) {
+          if (length(formals(p)) != 2)
+            stop("Parameter '", n, "' is a function but not of two arguments")
+        } else if (is.numeric(p) || is.character(p)) {
+          if (length(p) != env$end_iteration)
+            stop("Length of '", n, "' has to be equal to 'nrounds'")
+        } else {
+          stop("Parameter '", n, "' is not a function or a vector")
+        }
+      }
+    },
+    f_before_iter = function(env, model, data, watchlist, iteration) {
+      pars <- lapply(env$new_params, function(p) {
+        if (is.function(p)) {
+          return(p(iteration, env$end_iteration))
+        } else {
+          return(p[iteration])
+        }
+      })
+
+      if (inherits(model, "xgb.Booster")) {
+        xgb.parameters(model) <- pars
+      } else {
+        for (fd in model) {
+          xgb.parameters(fd$bst) <- pars
+        }
+      }
+      return(FALSE)
+    },
+    f_after_iter = NULL,
+    f_after_training = NULL
+  )
+}
+
+#' @title Callback to activate early stopping
+#' @param stopping_rounds The number of rounds with no improvement in
+#' the evaluation metric in order to stop the training.
+#' @param maximize Whether to maximize the evaluation metric.
+#' @param metric_name The name of an evaluation column to use as a criterion for early
+#' stopping. If not set, the last column would be used.
+#' Let's say the test data in \code{watchlist} was labelled as \code{dtest},
+#' and one wants to use the AUC in test data for early stopping regardless of where
+#' it is in the \code{watchlist}, then one of the following would need to be set:
+#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
+#' All dash '-' characters in metric names are considered equivalent to '_'.
+#' @param verbose Whether to print the early stopping information.
+#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced
+#' in the resulting object. If passing `FALSE`, will only keep the boosting rounds
+#' up to the detected best iteration, discarding the ones that come after.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @description
+#' This callback function determines the condition for early stopping.
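Before the early-stopping details below, a brief usage sketch for the `xgb.cb.reset.parameters()` constructor just defined; the decay schedule here is an arbitrary illustration of the two-argument function form:

```r
library(xgboost)

data(mtcars)
dtrain <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)

# Linear decay of the learning rate from 0.3 down to 0.05 across rounds
eta_schedule <- function(iteration, nrounds) {
  0.3 - (0.3 - 0.05) * (iteration - 1) / (nrounds - 1)
}

model <- xgb.train(
  data = dtrain,
  params = list(objective = "reg:squarederror", eta = 0.3, nthread = 1),
  nrounds = 20,
  callbacks = list(xgb.cb.reset.parameters(list(eta = eta_schedule)))
)
```

A vector of length `nrounds` (e.g. `seq(0.3, 0.05, length.out = 20)`) would work equally well in place of the function.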
+#' +#' The following attributes are assigned to the booster's object: +#' \itemize{ +#' \item \code{best_score} the evaluation score at the best iteration +#' \item \code{best_iteration} at which boosting iteration the best score has occurred +#' (0-based index for interoperability of binary models) +#' } +#' +#' The same values are also stored as R attributes as a result of the callback, plus an additional +#' attribute `stopped_by_max_rounds` which indicates whether an early stopping by the `stopping_rounds` +#' condition occurred. Note that the `best_iteration` that is stored under R attributes will follow +#' base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed +#' through \link{xgb.attr} or \link{xgb.attributes}. +#' +#' At least one data element is required in the evaluation watchlist for early stopping to work. +#' @export +xgb.cb.early.stop <- function( + stopping_rounds, + maximize = FALSE, + metric_name = NULL, + verbose = TRUE, + keep_all_iter = TRUE +) { + if (!is.null(metric_name)) { + stopifnot(is.character(metric_name)) + stopifnot(length(metric_name) == 1L) + } + + xgb.Callback( + cb_name = "early_stop", + env = as.environment( + list( + checked_evnames = FALSE, + stopping_rounds = stopping_rounds, + maximize = maximize, + metric_name = metric_name, + verbose = verbose, + keep_all_iter = keep_all_iter, + stopped_by_max_rounds = FALSE + ) + ), + f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) { + if (inherits(model, "xgb.Booster") && !length(watchlist)) { + stop("For early stopping, watchlist must have at least one element") + } + env$begin_iteration <- begin_iteration + return(NULL) + }, + f_before_iter = function(env, model, data, watchlist, iteration) NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) { + sds <- NULL + if (NCOL(iter_feval) > 1) { + tmp <- .summarize.feval(iter_feval, TRUE) + iter_feval <- tmp$feval + sds <- tmp$stdev + } + + if (!env$checked_evnames) { + + eval_names <- gsub('-', '_', names(iter_feval), fixed = TRUE) + if (!is.null(env$metric_name)) { + env$metric_idx <- which(gsub('-', '_', env$metric_name, fixed = TRUE) == eval_names) + if (length(env$metric_idx) == 0) + stop("'metric_name' for early stopping is not one of the following:\n", + paste(eval_names, collapse = ' '), '\n') + } + + if (is.null(env$metric_name)) { + if (NROW(iter_feval) == 1) { + env$metric_idx <- 1L + } else { + env$metric_idx <- length(eval_names) + if (env$verbose) + cat('Multiple eval metrics are present. 
Will use ', + eval_names[env$metric_idx], ' for early stopping.\n', sep = '') + } + } + + env$metric_name <- eval_names[env$metric_idx] + + # maximize is usually NULL when not set in xgb.train and built-in metrics + if (is.null(env$maximize)) + env$maximize <- grepl('(_auc|_aupr|_map|_ndcg|_pre)', env$metric_name) + + if (env$verbose) + cat("Will train until ", env$metric_name, " hasn't improved in ", + env$stopping_rounds, " rounds.\n\n", sep = '') + + env$best_iteration <- env$begin_iteration + if (env$maximize) { + env$best_score <- -Inf + } else { + env$best_score <- Inf + } + + if (inherits(model, "xgb.Booster")) { + best_score <- xgb.attr(model, 'best_score') + if (NROW(best_score)) env$best_score <- as.numeric(best_score) + best_iteration <- xgb.attr(model, 'best_iteration') + if (NROW(best_iteration)) env$best_iteration <- as.numeric(best_iteration) + 1 + } + + env$checked_evnames <- TRUE + } + + score <- iter_feval[env$metric_idx] + if ((env$maximize && score > env$best_score) || + (!env$maximize && score < env$best_score)) { + + env$best_score <- score + env$best_iteration <- iteration + # save the property to attributes, so they will occur in checkpoint + if (inherits(model, "xgb.Booster")) { + xgb.attributes(model) <- list( + best_iteration = env$best_iteration - 1, # convert to 0-based index + best_score = env$best_score + ) + } + } else if (iteration - env$best_iteration >= env$stopping_rounds) { + if (env$verbose) { + best_msg <- .format_eval_string(iteration, iter_feval, sds) + cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '') + } + env$stopped_by_max_rounds <- TRUE + return(TRUE) + } + return(FALSE) + }, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) { + if (inherits(model, "xgb.Booster") && !env$keep_all_iter && env$best_iteration < iteration) { + # Note: it loses the attributes after being sliced, + # so they have to be re-assigned afterwards. + prev_attr <- xgb.attributes(model) + if (NROW(prev_attr)) { + suppressWarnings({ + prev_attr <- within(prev_attr, rm("best_score", "best_iteration")) + }) + } + .Call(XGBoosterSliceAndReplace_R, xgb.get.handle(model), 0L, env$best_iteration, 1L) + if (NROW(prev_attr)) { + xgb.attributes(model) <- prev_attr + } + } + attrs_set <- list(best_iteration = env$best_iteration - 1, best_score = env$best_score) + if (inherits(model, "xgb.Booster")) { + xgb.attributes(model) <- attrs_set + } else { + for (fd in model) { + xgb.attributes(fd$bst) <- attrs_set # to use in the cv.predict callback + } + } + return( + list( + best_iteration = env$best_iteration, + best_score = env$best_score, + stopped_by_max_rounds = env$stopped_by_max_rounds + ) + ) + } + ) +} + +.save.model.w.formatted.name <- function(model, save_name, iteration) { + # Note: this throws a warning if the name doesn't have anything to format through 'sprintf' + suppressWarnings({ + save_name <- sprintf(save_name, iteration) + }) + xgb.save(model, save_name) +} + +#' @title Callback for saving a model file. +#' @param save_period Save the model to disk after every +#' \code{save_period} iterations; 0 means save the model at the end. +#' @param save_name The name or path for the saved model file. +#' It can contain a \code{\link[base]{sprintf}} formatting specifier +#' to include the integer iteration number in the file name. +#' E.g., with \code{save_name} = 'xgboost_%04d.model', +#' the file saved at iteration 50 would be named "xgboost_0050.model". 
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train},
+#' but \bold{not} to \link{xgb.cv}.
+#' @description
+#' This callback function allows saving an xgb-model file, either periodically
+#' after every \code{save_period} iterations, or at the end of training.
+#'
+#' Does not leave any attribute in the booster.
+#' @export
+xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
+  if (save_period < 0) {
+    stop("'save_period' cannot be negative")
+  }
+  if (!is.character(save_name) || length(save_name) != 1L) {
+    stop("'save_name' must be a single character referring to the file name.")
+  }
+
+  xgb.Callback(
+    cb_name = "save_model",
+    env = as.environment(list(save_period = save_period, save_name = save_name, last_save = 0)),
+    f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) {
+      env$begin_iteration <- begin_iteration
+    },
+    f_before_iter = NULL,
+    f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) {
+      if (env$save_period > 0 && (iteration - env$begin_iteration) %% env$save_period == 0) {
+        .save.model.w.formatted.name(model, env$save_name, iteration)
+        env$last_save <- iteration
+      }
+      return(FALSE)
+    },
+    f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) {
+      if (env$save_period == 0 && iteration > env$last_save) {
+        .save.model.w.formatted.name(model, env$save_name, iteration)
+      }
+    }
+  )
+}
+
+#' @title Callback for returning cross-validation based predictions.
+#' @param save_models A flag for whether to save the folds' models.
+#' @param outputmargin Whether to save margin predictions (same effect as passing this
+#' parameter to \link{predict.xgb.Booster}).
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.cv},
+#' but \bold{not} to \link{xgb.train}.
+#' @description
+#' This callback function saves predictions for all of the test folds,
+#' and also allows saving the folds' models.
+#' @details
+#' Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix,
+#' depending on the number of prediction outputs per data row. The order of predictions corresponds
+#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is
+#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
+#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
+#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
+#' When some of the indices in the training dataset are not included in the user-provided \code{folds},
+#' their prediction value would be \code{NA}. 
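+#' @examples
+#' # A minimal sketch of collecting out-of-fold predictions through this callback;
+#' # note that passing 'prediction = TRUE' to xgb.cv would add an equivalent
+#' # callback automatically.
+#' data(agaricus.train, package = "xgboost")
+#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+#' res <- xgb.cv(
+#'   params = list(objective = "binary:logistic", nthread = 2),
+#'   data = dtrain, nrounds = 3, nfold = 3,
+#'   callbacks = list(xgb.cb.cv.predict(save_models = TRUE))
+#' )
+#' str(res$cv_predict$pred)       # test-fold predictions, in the original row order
+#' length(res$cv_predict$models)  # one booster per fold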
+#' @export
+xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) {
+  xgb.Callback(
+    cb_name = "cv_predict",
+    env = as.environment(list(save_models = save_models, outputmargin = outputmargin)),
+    f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) {
+      if (inherits(model, "xgb.Booster")) {
+        stop("'cv.predict' callback is only for 'xgb.cv'.")
+      }
+    },
+    f_before_iter = NULL,
+    f_after_iter = NULL,
+    f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) {
+      pred <- NULL
+      for (fd in model) {
+        pr <- predict(
+          fd$bst,
+          fd$watchlist[[2L]],
+          outputmargin = env$outputmargin,
+          reshape = TRUE
+        )
+        if (is.null(pred)) {
+          if (NCOL(pr) > 1L) {
+            pred <- matrix(NA_real_, nrow(data), ncol(pr))
+          } else {
+            pred <- matrix(NA_real_, nrow(data))
+          }
+        }
+        if (is.matrix(pred)) {
+          pred[fd$index, ] <- pr
+        } else {
+          pred[fd$index] <- pr
+        }
+      }
+      out <- list(pred = pred)
+      if (env$save_models) {
+        out$models <- lapply(model, function(fd) fd$bst)
+      }
+      return(out)
+    }
+  )
+}
+
+.list2mat <- function(coef_list, sparse) {
+  if (sparse) {
+    coef_mat <- methods::new("dgRMatrix")
+    coef_mat@p <- as.integer(c(0, cumsum(sapply(coef_list, function(x) length(x@x)))))
+    coef_mat@j <- as.integer(unlist(lapply(coef_list, slot, "i")) - 1L)
+    coef_mat@x <- unlist(lapply(coef_list, slot, "x"))
+    coef_mat@Dim <- as.integer(c(length(coef_list), length(coef_list[[1L]])))
+    # Note: function 'xgb.gblinear.history' might later on try to slice by columns
+    coef_mat <- methods::as(coef_mat, "CsparseMatrix")
+    return(coef_mat)
+  } else {
+    return(unname(do.call(rbind, coef_list)))
+  }
+}
+
+.extract.coef <- function(model, sparse) {
+  coefs <- .internal.coef.xgb.Booster(model, add_names = FALSE)
+  if (NCOL(coefs) > 1L) {
+    coefs <- as.vector(coefs)
+  }
+  if (sparse) {
+    coefs <- methods::as(coefs, "sparseVector")
+  }
+  return(coefs)
+}
+
+#' @title Callback for collecting coefficients history of a gblinear booster
+#' @param sparse Whether to use a sparse matrix (`TRUE`) or a dense matrix (`FALSE`)
+#' to store the result.
+#' Sparse format is useful when one expects only a subset of coefficients to be non-zero,
+#' e.g. when using the "thrifty" feature selector with a fairly small number of top features
+#' selected per iteration.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details
+#' To keep things fast and simple, the gblinear booster does not internally store the history of linear
+#' model coefficients at each boosting iteration. This callback provides a workaround for storing
+#' the coefficients' path, by extracting them after each training iteration.
+#'
+#' This callback will construct a matrix where rows are boosting iterations and columns are
+#' feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept
+#' corresponding to the first column).
+#'
+#' When there is more than one coefficient per feature (e.g. multi-class classification),
+#' the result will be reshaped into a vector where coefficients are arranged first by features and
+#' then by class (e.g. first 1 through N coefficients will be for the first class, then
+#' coefficients N+1 through 2N for the second class, and so on). 
+#'
+#' If the result has only one coefficient per feature in the data, then the resulting matrix
+#' will have column names matching the feature names, otherwise (when there's more than
+#' one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
+#' (so e.g. column 'c1' for class '0' will be named 'c1:0').
+#'
+#' With \code{xgb.train}, the output is either a dense or a sparse matrix.
+#' With \code{xgb.cv}, it is a list (one element per fold) of such
+#' matrices.
+#'
+#' The \link{xgb.gblinear.history} function provides an easy way to retrieve the
+#' outputs from this callback.
+#' @seealso \link{xgb.gblinear.history}, \link{coef.xgb.Booster}.
+#' @examples
+#' #### Binary classification:
+#'
+#' ## Keep the number of threads to 1 for examples
+#' nthread <- 1
+#' data.table::setDTthreads(nthread)
+#'
+#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest
+#' # without considering the 2nd order interactions:
+#' x <- model.matrix(Species ~ .^2, iris)[,-1]
+#' colnames(x)
+#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread)
+#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc",
+#'               lambda = 0.0003, alpha = 0.0003, nthread = nthread)
+#' # For 'shotgun', which is the default linear updater, using high eta values may result in
+#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning
+#' # rate does not break the convergence, but allows us to illustrate the typical pattern of
+#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
+#'                  callbacks = list(xgb.cb.gblinear.history()))
+#' # Extract the coefficients' path and plot them vs boosting iteration number:
+#' coef_path <- xgb.gblinear.history(bst)
+#' matplot(coef_path, type = 'l')
+#'
+#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates.
+#' # Will try the classical componentwise boosting which selects a single best feature per round:
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
+#'                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
+#'                  callbacks = list(xgb.cb.gblinear.history()))
+#' matplot(xgb.gblinear.history(bst), type = 'l')
+#' # Componentwise boosting is known to have a similar effect to Lasso regularization.
+#' # Try experimenting with various values of top_k, eta, nrounds,
+#' # as well as different feature_selectors. 
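+#'
+#' # As a quick sanity check (a sketch): given the column order documented above,
+#' # the last row of the extracted history should match the final model's
+#' # coefficients as reported by coef(), with the intercept in the first position:
+#' coef_path <- xgb.gblinear.history(bst)
+#' all.equal(unname(coef_path[nrow(coef_path), ]), unname(coef(bst)))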
+#' +#' # For xgb.cv: +#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, +#' callbacks = list(xgb.cb.gblinear.history())) +#' # coefficients in the CV fold #3 +#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l') +#' +#' +#' #### Multiclass classification: +#' # +#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) +#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, +#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) +#' # For the default linear updater 'shotgun' it sometimes is helpful +#' # to use smaller eta to reduce instability +#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history())) +#' # Will plot the coefficient paths separately for each class: +#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') +#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') +#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') +#' +#' # CV: +#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history(FALSE))) +#' # 1st fold of 1st class +#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') +#' +#' @export +xgb.cb.gblinear.history <- function(sparse = FALSE) { + xgb.Callback( + cb_name = "gblinear_history", + env = as.environment(list(sparse = sparse)), + f_before_training = function(env, model, data, watchlist, begin_iteration, end_iteration) { + if (!inherits(model, "xgb.Booster")) { + model <- model[[1L]]$bst + } + if (xgb.booster_type(model) != "gblinear") { + stop("Callback 'xgb.cb.gblinear.history' is only for booster='gblinear'.") + } + env$coef_hist <- vector("list", end_iteration - begin_iteration + 1) + env$next_idx <- 1 + }, + f_before_iter = NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) { + if (inherits(model, "xgb.Booster")) { + coef_this <- .extract.coef(model, env$sparse) + } else { + coef_this <- lapply(model, function(fd) .extract.coef(fd$bst, env$sparse)) + } + env$coef_hist[[env$next_idx]] <- coef_this + env$next_idx <- env$next_idx + 1 + return(FALSE) + }, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, prev_cb_res) { + # in case of early stopping + if (env$next_idx <= length(env$coef_hist)) { + env$coef_hist <- head(env$coef_hist, env$next_idx - 1) + } + + is_booster <- inherits(model, "xgb.Booster") + if (is_booster) { + out <- .list2mat(env$coef_hist, env$sparse) + } else { + out <- lapply( + X = lapply( + X = seq_along(env$coef_hist[[1]]), + FUN = function(i) lapply(env$coef_hist, "[[", i) + ), + FUN = .list2mat, + env$sparse + ) + } + if (!is.null(prev_cb_res)) { + if (is_booster) { + out <- rbind(prev_cb_res, out) + } else { + # Note: this case should never be encountered, since training cannot + # be continued from the result of xgb.cv, but this code should in + # theory do the job if the situation were to be encountered. 
+          out <- lapply(
+            out,
+            function(lst) {
+              lapply(
+                seq_along(lst),
+                function(i) rbind(prev_cb_res[[i]], lst[[i]])
+              )
+            }
+          )
+        }
+      }
+      feature_names <- getinfo(data, "feature_name")
+      if (!NROW(feature_names)) {
+        feature_names <- paste0("V", seq(1L, ncol(data)))
+      }
+      expected_ncols <- length(feature_names) + 1
+      if (is_booster) {
+        mat_ncols <- ncol(out)
+      } else {
+        mat_ncols <- ncol(out[[1L]])
+      }
+      if (mat_ncols %% expected_ncols == 0) {
+        feature_names <- c("(Intercept)", feature_names)
+        n_rep <- mat_ncols / expected_ncols
+        if (n_rep > 1) {
+          feature_names <- unlist(
+            lapply(
+              seq(1, n_rep),
+              function(cl) paste(feature_names, cl - 1, sep = ":")
+            )
+          )
+        }
+        if (is_booster) {
+          colnames(out) <- feature_names
+        } else {
+          out <- lapply(
+            out,
+            function(mat) {
+              colnames(mat) <- feature_names
+              return(mat)
+            }
+          )
+        }
+      }
+      return(out)
+    }
+  )
+}
+
+#' @title Extract gblinear coefficients history.
+#' @description A helper function to extract the matrix of linear coefficients' history
+#' from a gblinear model created while using the \link{xgb.cb.gblinear.history}
+#' callback (which must be added manually, since it is not used by default).
+#' @details Note that this is an R-specific function that relies on R attributes that
+#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
+#' or \link{xgb.load.raw}.
+#'
+#' In order for a serialized model to be accepted by this function, one must use R
+#' serializers such as \link{saveRDS}.
+#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
+#' using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster
+#' loaded from \link{xgb.load} or \link{xgb.load.raw}.
+#' @param class_index zero-based class index to extract the coefficients for only that
+#' specific class in a multinomial multiclass model. When it is NULL, all the
+#' coefficients are returned. Has no effect in non-multiclass models.
+#'
+#' @return
+#' For an \link{xgb.train} result, a matrix (either dense or sparse) with columns
+#' corresponding to the model coefficients and rows corresponding to boosting iterations.
+#'
+#' For an \link{xgb.cv} result, a list of such matrices is returned with the elements
+#' corresponding to CV folds.
+#'
+#' When there is more than one coefficient per feature (e.g. multi-class classification)
+#' and `class_index` is not provided,
+#' the result will be reshaped into a vector where coefficients are arranged first by features and
+#' then by class (e.g. first 1 through N coefficients will be for the first class, then
+#' coefficients N+1 through 2N for the second class, and so on).
+#' @seealso \link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. 
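+#' @examples
+#' # A minimal sketch of the serialization caveat described above, assuming `bst` is a
+#' # gblinear booster that was trained with the xgb.cb.gblinear.history() callback:
+#' \dontrun{
+#' fname_rds <- file.path(tempdir(), "bst.rds")
+#' saveRDS(bst, fname_rds)
+#' xgb.gblinear.history(readRDS(fname_rds))  # R attributes are kept: this works
+#'
+#' fname_ubj <- file.path(tempdir(), "bst.ubj")
+#' xgb.save(bst, fname_ubj)
+#' xgb.gblinear.history(xgb.load(fname_ubj))  # R attributes are lost: this errors
+#' }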
+#' @export +xgb.gblinear.history <- function(model, class_index = NULL) { + + if (!(inherits(model, "xgb.Booster") || + inherits(model, "xgb.cv.synchronous"))) + stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class") + is_cv <- inherits(model, "xgb.cv.synchronous") + + if (!is_cv) { + coef_path <- getElement(attributes(model), "gblinear_history") + } else { + coef_path <- getElement(model, "gblinear_history") + } + if (is.null(coef_path)) { + stop("model must be trained while using the xgb.cb.gblinear.history() callback") + } + + if (!is_cv) { + num_class <- xgb.num_class(model) + num_feat <- xgb.num_feature(model) + } else { + # in case of CV, the object is expected to have this info + if (model$params$booster != "gblinear") + stop("It does not appear to be a gblinear model") + num_class <- NVL(model$params$num_class, 1) + num_feat <- model$nfeatures + if (is.null(num_feat)) + stop("This xgb.cv result does not have nfeatures info") + } + + if (!is.null(class_index) && + num_class > 1 && + (class_index[1] < 0 || class_index[1] >= num_class)) + stop("class_index has to be within [0,", num_class - 1, "]") + + if (!is.null(class_index) && num_class > 1) { + seq_take <- seq(1 + class_index * (num_feat + 1), (class_index + 1) * (num_feat + 1)) + coef_path <- if (is.list(coef_path)) { + lapply(coef_path, function(x) x[, seq_take]) + } else { + coef_path <- coef_path[, seq_take] + } + } + return(coef_path) +} + +.callbacks.only.train <- "save_model" +.callbacks.only.cv <- "cv_predict" + +.process.callbacks <- function(callbacks, is_cv) { + if (inherits(callbacks, "xgb.Callback")) { + callbacks <- list(callbacks) + } + if (!is.list(callbacks)) { + stop("'callbacks' must be a list.") + } + cb_names <- character() + if (length(callbacks)) { + is_callback <- sapply(callbacks, inherits, "xgb.Callback") + if (!all(is_callback)) { + stop("Entries in 'callbacks' must be 'xgb.Callback' objects.") + } + cb_names <- sapply(callbacks, function(cb) cb$cb_name) + if (length(cb_names) != length(callbacks)) { + stop("Passed invalid callback(s).") + } + if (anyDuplicated(cb_names) > 0) { + stop("Callbacks must have unique names.") + } + if (is_cv) { + if (any(.callbacks.only.train %in% cb_names)) { + stop( + "Passed callback(s) not supported for 'xgb.cv': ", + paste(intersect(.callbacks.only.train, cb_names), collapse = ", ") + ) + } + } else { + if (any(.callbacks.only.cv %in% cb_names)) { + stop( + "Passed callback(s) not supported for 'xgb.train': ", + paste(intersect(.callbacks.only.cv, cb_names), collapse = ", ") + ) + } + } + # Early stopping callback needs to be executed before the others + if ("early_stop" %in% cb_names) { + mask <- cb_names == "early_stop" + callbacks <- c(list(callbacks[[which(mask)]]), callbacks[!mask]) + } + } + return(list(callbacks = callbacks, cb_names = cb_names)) +} + +# Note: don't try to use functions like 'append', as they will +# merge the elements of the different callbacks into a single list. 
+add.callback <- function(callbacks, cb, as_first_elt = FALSE) {
+  if (!as_first_elt) {
+    callbacks[[length(callbacks) + 1]] <- cb
+    return(callbacks)
+  } else {
+    if (!length(callbacks)) {
+      return(list(cb))
+    }
+    new_cb <- vector("list", length(callbacks) + 1)
+    new_cb[[1]] <- cb
+    new_cb[seq(2, length(new_cb))] <- callbacks
+    return(new_cb)
+  }
+}
+
+has.callbacks <- function(callbacks, cb_name) {
+  cb_names <- sapply(callbacks, function(cb) cb$cb_name)
+  return(cb_name %in% cb_names)
 }
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index e8ae787fc..723310ee4 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -142,7 +142,7 @@ check.custom.eval <- function(env = parent.frame()) {
   if (!is.null(env$feval) &&
       is.null(env$maximize) && (
         !is.null(env$early_stopping_rounds) ||
-          has.callbacks(env$callbacks, 'cb.early.stop')))
+          has.callbacks(env$callbacks, "early_stop")))
     stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not")
 }
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 8a5d66198..77d75fa9c 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -1071,6 +1071,10 @@ xgb.best_iteration <- function(bst) {
 #' coef(model)
 #' @export
 coef.xgb.Booster <- function(object, ...) {
+  return(.internal.coef.xgb.Booster(object, add_names = TRUE))
+}
+
+.internal.coef.xgb.Booster <- function(object, add_names = TRUE) {
   booster_type <- xgb.booster_type(object)
   if (booster_type != "gblinear") {
     stop("Coefficients are not defined for Booster type ", booster_type)
@@ -1089,21 +1093,27 @@ coef.xgb.Booster <- function(object, ...) {
   intercepts <- weights[seq(sep + 1, length(weights))]
   intercepts <- intercepts + as.numeric(base_score)
-  feature_names <- xgb.feature_names(object)
-  if (!NROW(feature_names)) {
-    # This mimics the default naming in R which names columns as "V1..N"
-    # when names are needed but not available
-    feature_names <- paste0("V", seq(1L, num_feature))
+  if (add_names) {
+    feature_names <- xgb.feature_names(object)
+    if (!NROW(feature_names)) {
+      # This mimics the default naming in R which names columns as "V1..N"
+      # when names are needed but not available
+      feature_names <- paste0("V", seq(1L, num_feature))
+    }
+    feature_names <- c("(Intercept)", feature_names)
   }
-  feature_names <- c("(Intercept)", feature_names)
   if (n_cols == 1L) {
     out <- c(intercepts, coefs)
-    names(out) <- feature_names
+    if (add_names) {
+      names(out) <- feature_names
+    }
   } else {
     coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE)
     dim(intercepts) <- c(1L, n_cols)
     out <- rbind(intercepts, coefs)
-    row.names(out) <- feature_names
+    if (add_names) {
+      row.names(out) <- feature_names
+    }
     # TODO: if a class names attributes is added,
     # should use those names here.
   }
@@ -1255,12 +1265,9 @@ print.xgb.Booster <- function(x, ...) {
     cat("  ", paste(attr_names, collapse = ", "), "\n")
   }
-  if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) {
-    cat('callbacks:\n')
-    lapply(callback.calls(R_attrs$callbacks), function(x) {
-      cat('  ')
-      print(x)
-    })
+  additional_attr <- setdiff(names(R_attrs), .reserved_cb_names)
+  if (NROW(additional_attr)) {
+    cat("callbacks:\n  ", paste(additional_attr, collapse = ", "), "\n")
   }
   if (!is.null(R_attrs$evaluation_log)) {
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 29bddb57f..23ca0f2de 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -27,7 +27,7 @@
 #' that NA values should be considered as 'missing' by the algorithm. 
#' Sometimes, 0 or other extreme value might be used to represent missing values. #' @param prediction A logical value indicating whether to return the test fold predictions -#' from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback. +#' from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback. #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param metrics, list of evaluation metrics to be used in cross validation, #' when it is not specified, the evaluation metric is chosen according to objective function. @@ -57,17 +57,17 @@ #' @param verbose \code{boolean}, print the statistics during the process #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the -#' \code{\link{cb.print.evaluation}} callback. +#' \code{\link{xgb.cb.print.evaluation}} callback. #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' doesn't improve for \code{k} rounds. -#' Setting this parameter engages the \code{\link{cb.early.stop}} callback. +#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback. #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set, #' then this parameter must be set as well. #' When it is \code{TRUE}, it means the larger the evaluation score the better. -#' This parameter is passed to the \code{\link{cb.early.stop}} callback. +#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback. #' @param callbacks a list of callback functions to perform various task during boosting. -#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the +#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. #' @param ... other parameters to pass to \code{params}. @@ -90,25 +90,25 @@ #' \itemize{ #' \item \code{call} a function call. #' \item \code{params} parameters that were passed to the xgboost library. Note that it does not -#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback. -#' \item \code{callbacks} callback functions that were either automatically assigned or -#' explicitly passed. +#' capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback. #' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the #' first column corresponding to iteration number and the rest corresponding to the #' CV-based evaluation means and standard deviations for the training and test CV-sets. -#' It is created by the \code{\link{cb.evaluation.log}} callback. +#' It is created by the \code{\link{xgb.cb.evaluation.log}} callback. #' \item \code{niter} number of boosting iterations. #' \item \code{nfeatures} number of features in training data. #' \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} #' parameter or randomly generated. #' \item \code{best_iteration} iteration number with the best evaluation metric value #' (only available with early stopping). -#' \item \code{pred} CV prediction values available when \code{prediction} is set. 
-#' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
-#' \item \code{models} a list of the CV folds' models. It is only available with the explicit
-#' setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
 #' }
 #'
+#' Plus other potential elements that are the result of callbacks, such as a list `cv_predict` with
+#' a sub-element `pred` when passing `prediction = TRUE`, which is added by the \link{xgb.cb.cv.predict}
+#' callback (note that one can also pass it manually under `callbacks` with different settings,
+#' such as also saving the models created during cross validation); or a list `early_stop` which
+#' will contain elements such as `best_iteration` when using the early stopping callback (\link{xgb.cb.early.stop}).
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
@@ -160,32 +160,38 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
     folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params)
   }
+  # Callbacks
+  tmp <- .process.callbacks(callbacks, is_cv = TRUE)
+  callbacks <- tmp$callbacks
+  cb_names <- tmp$cb_names
+  rm(tmp)
+
+  # Early stopping callback
+  if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) {
+    callbacks <- add.callback(
+      callbacks,
+      xgb.cb.early.stop(
+        early_stopping_rounds,
+        maximize = maximize,
+        verbose = verbose
+      ),
+      as_first_elt = TRUE
+    )
+  }
   # verbosity & evaluation printing callback:
   params <- c(params, list(silent = 1))
   print_every_n <- max(as.integer(print_every_n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
-    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
+  if (verbose && !("print_evaluation" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n, showsd = showsd))
   }
   # evaluation log callback: always is on in CV
-  evaluation_log <- list()
-  if (!has.callbacks(callbacks, 'cb.evaluation.log')) {
-    callbacks <- add.cb(callbacks, cb.evaluation.log())
-  }
-  # Early stopping callback
-  stop_condition <- FALSE
-  if (!is.null(early_stopping_rounds) &&
-      !has.callbacks(callbacks, 'cb.early.stop')) {
-    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
-                                                 maximize = maximize, verbose = verbose))
+  if (!("evaluation_log" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.evaluation.log())
   }
   # CV-predictions callback
-  if (prediction &&
-      !has.callbacks(callbacks, 'cb.cv.predict')) {
-    callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
+  if (prediction && !("cv_predict" %in% cb_names)) {
+    callbacks <- add.callback(callbacks, xgb.cb.cv.predict(save_models = FALSE))
   }
-  # Sort the callbacks into categories
-  cb <- categorize.callbacks(callbacks)
-
   # create the booster-folds
   # train_folds
@@ -211,9 +217,6 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
     bst <- bst$bst
     list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
   })
-  rm(dall)
-  # a "basket" to collect some results from callbacks
-  basket <- list()
   # extract parameters that can affect the relationship b/w #trees and #iterations
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)  # nolint
@@ -222,10 +225,25 @@
   begin_iteration <- 1
   end_iteration <- nrounds
+  .execute.cb.before.training(
+    callbacks,
+    
bst_folds, + dall, + NULL, + begin_iteration, + end_iteration + ) + # synchronous CV boosting: run CV folds' models within each iteration for (iteration in begin_iteration:end_iteration) { - for (f in cb$pre_iter) f() + .execute.cb.before.iter( + callbacks, + bst_folds, + dall, + NULL, + iteration + ) msg <- lapply(bst_folds, function(fd) { xgb.iter.update( @@ -242,27 +260,36 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing ) }) msg <- simplify2array(msg) - # Note: these variables might look unused here, but they are used in the callbacks - bst_evaluation <- rowMeans(msg) # nolint - bst_evaluation_err <- apply(msg, 1, sd) # nolint - for (f in cb$post_iter) f() + should_stop <- .execute.cb.after.iter( + callbacks, + bst_folds, + dall, + NULL, + iteration, + msg + ) - if (stop_condition) break + if (should_stop) break } - for (f in cb$finalize) f(finalize = TRUE) + cb_outputs <- .execute.cb.after.training( + callbacks, + bst_folds, + dall, + NULL, + iteration, + msg + ) # the CV result ret <- list( call = match.call(), params = params, - callbacks = callbacks, - evaluation_log = evaluation_log, - niter = end_iteration, - nfeatures = ncol(data), + niter = iteration, + nfeatures = ncol(dall), folds = folds ) - ret <- c(ret, basket) + ret <- c(ret, cb_outputs) class(ret) <- 'xgb.cv.synchronous' return(invisible(ret)) @@ -308,23 +335,16 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) { paste0('"', unlist(x$params), '"'), sep = ' = ', collapse = ', '), '\n', sep = '') } - if (!is.null(x$callbacks) && length(x$callbacks) > 0) { - cat('callbacks:\n') - lapply(callback.calls(x$callbacks), function(x) { - cat(' ') - print(x) - }) - } for (n in c('niter', 'best_iteration')) { - if (is.null(x[[n]])) + if (is.null(x$early_stop[[n]])) next - cat(n, ': ', x[[n]], '\n', sep = '') + cat(n, ': ', x$early_stop[[n]], '\n', sep = '') } - if (!is.null(x$pred)) { + if (!is.null(x$cv_predict$pred)) { cat('pred:\n') - str(x$pred) + str(x$cv_predict$pred) } } @@ -332,9 +352,9 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) { cat('evaluation_log:\n') print(x$evaluation_log, row.names = FALSE, ...) - if (!is.null(x$best_iteration)) { + if (!is.null(x$early_stop$best_iteration)) { cat('Best iteration:\n') - print(x$evaluation_log[x$best_iteration], row.names = FALSE, ...) + print(x$evaluation_log[x$early_stop$best_iteration], row.names = FALSE, ...) } invisible(x) } diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 4985f74b5..d5b192bcb 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -6,7 +6,7 @@ #' #' @details #' The input file is expected to contain a model saved in an xgboost model format -#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +#' using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some #' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and #' saved from there in xgboost format, could be loaded from R. #' diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 44cde2e7a..34c21d552 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -118,7 +118,7 @@ #' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each #' of these datasets during each boosting iteration, and stored in the end as a field named #' \code{evaluation_log} in the resulting object. 
When either \code{verbose>=1} or
-#' \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+#' \code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously
 #' printed out during the training.
 #' E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
 #' the performance of each round's model on mat1 and mat2.
@@ -130,31 +130,32 @@
 #' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
 #' If 2, some additional information will be printed out.
 #' Note that setting \code{verbose > 0} automatically engages the
-#' \code{cb.print.evaluation(period=1)} callback function.
+#' \code{xgb.cb.print.evaluation(period=1)} callback function.
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #' Default is 1 which means all messages are printed. This parameter is passed to the
-#' \code{\link{cb.print.evaluation}} callback.
+#' \code{\link{xgb.cb.print.evaluation}} callback.
 #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
 #' If set to an integer \code{k}, training with a validation set will stop if the performance
 #' doesn't improve for \code{k} rounds.
-#' Setting this parameter engages the \code{\link{cb.early.stop}} callback.
+#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.
 #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
 #' then this parameter must be set as well.
 #' When it is \code{TRUE}, it means the larger the evaluation score the better.
-#' This parameter is passed to the \code{\link{cb.early.stop}} callback.
+#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.
 #' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
-#' 0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.
+#' 0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback.
 #' @param save_name the name or path for periodically saved model file.
 #' @param xgb_model a previously built model to continue the training from.
 #' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
 #' file with a previously saved model.
 #' @param callbacks a list of callback functions to perform various task during boosting.
-#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 #' parameters' values. User can provide either existing or their own callback methods in order
 #' to customize the training process.
 #'
-#' Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
-#' are kept as R attributes, and thus do not get saved when using non-R serializaters like
+#' Note that some callbacks might try to leave attributes in the resulting model object,
+#' such as an evaluation log (a `data.table` object) - be aware that these objects are kept
+#' as R attributes, and thus do not get saved when using XGBoost's own serializers like
 #' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
 #' @param ... other parameters to pass to \code{params}.
 #' @param label vector of response values. 
Should not be provided when data is @@ -206,18 +207,19 @@ #' #' The following callbacks are automatically created when certain parameters are set: #' \itemize{ -#' \item \code{cb.print.evaluation} is turned on when \code{verbose > 0}; +#' \item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0}; #' and the \code{print_every_n} parameter is passed to it. -#' \item \code{cb.evaluation.log} is on when \code{watchlist} is present. -#' \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. -#' \item \code{cb.save.model}: when \code{save_period > 0} is set. +#' \item \code{xgb.cb.evaluation.log} is on when \code{watchlist} is present. +#' \item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set. +#' \item \code{xgb.cb.save.model}: when \code{save_period > 0} is set. #' } #' #' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently #' from typical R objects (it's an 'altrep' list class), and it makes a separation between #' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr} #' and shared between interfaces through serialization functions like \link{xgb.save}; and -#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise +#' R-specific attributes (typically the result from a callback), accessed through \link{attributes} +#' and \link{attr}, which are otherwise #' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and #' not anyhow used by functions like \link{predict.xgb.Booster}. #' @@ -229,7 +231,7 @@ #' effect elsewhere. #' #' @seealso -#' \code{\link{callbacks}}, +#' \code{\link{xgb.Callback}}, #' \code{\link{predict.xgb.Booster}}, #' \code{\link{xgb.cv}} #' @@ -295,7 +297,7 @@ #' objective = "binary:logistic", eval_metric = "auc") #' my_etas <- list(eta = c(0.5, 0.1)) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, -#' callbacks = list(cb.reset.parameters(my_etas))) +#' callbacks = list(xgb.cb.reset.parameters(my_etas))) #' #' ## Early stopping: #' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, @@ -339,47 +341,47 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), params <- c(params, list(eval_metric = m)) } - # evaluation printing callback params <- c(params) - print_every_n <- max(as.integer(print_every_n), 1L) - if (!has.callbacks(callbacks, 'cb.print.evaluation') && - verbose) { - callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) - } - # evaluation log callback: it is automatically enabled when watchlist is provided - evaluation_log <- list() - if (!has.callbacks(callbacks, 'cb.evaluation.log') && - length(watchlist) > 0) { - callbacks <- add.cb(callbacks, cb.evaluation.log()) - } - # Model saving callback - if (!is.null(save_period) && - !has.callbacks(callbacks, 'cb.save.model')) { - callbacks <- add.cb(callbacks, cb.save.model(save_period, save_name)) - } - # Early stopping callback - stop_condition <- FALSE - if (!is.null(early_stopping_rounds) && - !has.callbacks(callbacks, 'cb.early.stop')) { - callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, - maximize = maximize, verbose = verbose)) - } - - # Sort the callbacks into categories - cb <- categorize.callbacks(callbacks) params['validate_parameters'] <- TRUE if (!("seed" %in% names(params))) { params[["seed"]] <- sample(.Machine$integer.max, size = 1) } + # callbacks + tmp <- .process.callbacks(callbacks, is_cv = FALSE) + callbacks <- 
tmp$callbacks + cb_names <- tmp$cb_names + rm(tmp) + + # Early stopping callback (should always come first) + if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) { + callbacks <- add.callback( + callbacks, + xgb.cb.early.stop( + early_stopping_rounds, + maximize = maximize, + verbose = verbose + ), + as_first_elt = TRUE + ) + } + # evaluation printing callback + print_every_n <- max(as.integer(print_every_n), 1L) + if (verbose && !("print_evaluation" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n)) + } + # evaluation log callback: it is automatically enabled when watchlist is provided + if (length(watchlist) && !("evaluation_log" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.evaluation.log()) + } + # Model saving callback + if (!is.null(save_period) && !("save_model" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.save.model(save_period, save_name)) + } + # The tree updating process would need slightly different handling is_update <- NVL(params[['process_type']], '.') == 'update' - past_evaluation_log <- NULL - if (inherits(xgb_model, "xgb.Booster")) { - past_evaluation_log <- attributes(xgb_model)$evaluation_log - } - # Construct a booster (either a new one or load from xgb_model) bst <- xgb.Booster( params = params, @@ -394,11 +396,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), dtrain ) - # extract parameters that can affect the relationship b/w #trees and #iterations - # Note: it might look like these aren't used, but they need to be defined in this - # environment for the callbacks for work correctly. - num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint - if (is_update && nrounds > niter_init) stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") @@ -406,20 +403,36 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), begin_iteration <- niter_skip + 1 end_iteration <- niter_skip + nrounds + .execute.cb.before.training( + callbacks, + bst, + dtrain, + watchlist, + begin_iteration, + end_iteration + ) + # the main loop for boosting iterations for (iteration in begin_iteration:end_iteration) { - for (f in cb$pre_iter) f() - - xgb.iter.update( - bst = bst, - dtrain = dtrain, - iter = iteration - 1, - obj = obj + .execute.cb.before.iter( + callbacks, + bst, + dtrain, + watchlist, + iteration ) + xgb.iter.update( + bst = bst, + dtrain = dtrain, + iter = iteration - 1, + obj = obj + ) + + bst_evaluation <- NULL if (length(watchlist) > 0) { - bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter + bst_evaluation <- xgb.iter.eval( bst = bst, watchlist = watchlist, iter = iteration - 1, @@ -427,36 +440,46 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), ) } - for (f in cb$post_iter) f() + should_stop <- .execute.cb.after.iter( + callbacks, + bst, + dtrain, + watchlist, + iteration, + bst_evaluation + ) - if (stop_condition) break + if (should_stop) break } - for (f in cb$finalize) f(finalize = TRUE) - # store the evaluation results - keep_evaluation_log <- FALSE - if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) { - keep_evaluation_log <- TRUE - # include the previous compatible history when available - if (inherits(xgb_model, 'xgb.Booster') && - !is_update && - !is.null(past_evaluation_log) && - isTRUE(all.equal(colnames(evaluation_log), - colnames(past_evaluation_log)))) { - evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log)) - } - 
} + cb_outputs <- .execute.cb.after.training( + callbacks, + bst, + dtrain, + watchlist, + iteration, + bst_evaluation + ) extra_attrs <- list( call = match.call(), - params = params, - callbacks = callbacks + params = params ) - if (keep_evaluation_log) { - extra_attrs$evaluation_log <- evaluation_log - } + curr_attrs <- attributes(bst) - attributes(bst) <- c(curr_attrs, extra_attrs) + if (NROW(curr_attrs)) { + curr_attrs <- curr_attrs[ + setdiff( + names(curr_attrs), + c(names(extra_attrs), names(cb_outputs)) + ) + ] + } + curr_attrs <- c(extra_attrs, curr_attrs) + if (NROW(cb_outputs)) { + curr_attrs <- c(curr_attrs, cb_outputs) + } + attributes(bst) <- curr_attrs return(bst) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 170aa5ffd..7fecec39c 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -82,12 +82,8 @@ NULL NULL # Various imports -#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix -#' @importFrom Matrix colSums +#' @importClassesFrom Matrix dgCMatrix dgRMatrix CsparseMatrix #' @importFrom Matrix sparse.model.matrix -#' @importFrom Matrix sparseVector -#' @importFrom Matrix sparseMatrix -#' @importFrom Matrix t #' @importFrom data.table data.table #' @importFrom data.table is.data.table #' @importFrom data.table as.data.table @@ -103,6 +99,7 @@ NULL #' @importFrom stats coef #' @importFrom stats predict #' @importFrom stats median +#' @importFrom stats sd #' @importFrom stats variable.names #' @importFrom utils head #' @importFrom graphics barplot diff --git a/R-package/man/callbacks.Rd b/R-package/man/callbacks.Rd deleted file mode 100644 index 9f6f69015..000000000 --- a/R-package/man/callbacks.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{callbacks} -\alias{callbacks} -\title{Callback closures for booster training.} -\description{ -These are used to perform various service tasks either during boosting iterations or at the end. -This approach helps to modularize many of such tasks without bloating the main training methods, -and it offers . -} -\details{ -By default, a callback function is run after each boosting iteration. -An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function. - -When a callback function has \code{finalize} parameter, its finalizer part will also be run after -the boosting is completed. - -WARNING: side-effects!!! Be aware that these callback functions access and modify things in -the environment from which they are called from, which is a fairly uncommon thing to do in R. - -To write a custom callback closure, make sure you first understand the main concepts about R environments. -Check either R documentation on \code{\link[base]{environment}} or the -\href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R" -book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks - -choose ones that do something similar to what you want to achieve. Also, you would need to get familiar -with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments. 
-} -\seealso{ -\code{\link{cb.print.evaluation}}, -\code{\link{cb.evaluation.log}}, -\code{\link{cb.reset.parameters}}, -\code{\link{cb.early.stop}}, -\code{\link{cb.save.model}}, -\code{\link{cb.cv.predict}}, -\code{\link{xgb.train}}, -\code{\link{xgb.cv}} -} diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd deleted file mode 100644 index 7cd51a3ce..000000000 --- a/R-package/man/cb.early.stop.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.early.stop} -\alias{cb.early.stop} -\title{Callback closure to activate the early stopping.} -\usage{ -cb.early.stop( - stopping_rounds, - maximize = FALSE, - metric_name = NULL, - verbose = TRUE -) -} -\arguments{ -\item{stopping_rounds}{The number of rounds with no improvement in -the evaluation metric in order to stop the training.} - -\item{maximize}{whether to maximize the evaluation metric} - -\item{metric_name}{the name of an evaluation column to use as a criteria for early -stopping. If not set, the last column would be used. -Let's say the test data in \code{watchlist} was labelled as \code{dtest}, -and one wants to use the AUC in test data for early stopping regardless of where -it is in the \code{watchlist}, then one of the following would need to be set: -\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. -All dash '-' characters in metric names are considered equivalent to '_'.} - -\item{verbose}{whether to print the early stopping information.} -} -\description{ -Callback closure to activate the early stopping. -} -\details{ -This callback function determines the condition for early stopping -by setting the \code{stop_condition = TRUE} flag in its calling frame. - -The following additional fields are assigned to the model's R object: -\itemize{ -\item \code{best_score} the evaluation score at the best iteration -\item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index) -} -The Same values are also stored as xgb-attributes: -\itemize{ -\item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models) -\item \code{best_msg} message string is also stored. -} - -At least one data element is required in the evaluation watchlist for early stopping to work. - -Callback function expects the following values to be set in its calling frame: -\code{stop_condition}, -\code{bst_evaluation}, -\code{rank}, -\code{bst} (or \code{bst_folds} and \code{basket}), -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}, -} -\seealso{ -\code{\link{callbacks}}, -\code{\link{xgb.attr}} -} diff --git a/R-package/man/cb.evaluation.log.Rd b/R-package/man/cb.evaluation.log.Rd deleted file mode 100644 index 94f8a02e6..000000000 --- a/R-package/man/cb.evaluation.log.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.evaluation.log} -\alias{cb.evaluation.log} -\title{Callback closure for logging the evaluation history} -\usage{ -cb.evaluation.log() -} -\description{ -Callback closure for logging the evaluation history -} -\details{ -This callback function appends the current iteration evaluation results \code{bst_evaluation} -available in the calling parent frame to the \code{evaluation_log} list in a calling frame. - -The finalizer callback (called with \code{finalize = TURE} in the end) converts -the \code{evaluation_log} list into a final data.table. 
- -The iteration evaluation result \code{bst_evaluation} must be a named numeric vector. - -Note: in the column names of the final data.table, the dash '-' character is replaced with -the underscore '_' in order to make the column names more like regular R identifiers. - -Callback function expects the following values to be set in its calling frame: -\code{evaluation_log}, -\code{bst_evaluation}, -\code{iteration}. -} -\seealso{ -\code{\link{callbacks}} -} diff --git a/R-package/man/cb.print.evaluation.Rd b/R-package/man/cb.print.evaluation.Rd deleted file mode 100644 index 59b9ba65e..000000000 --- a/R-package/man/cb.print.evaluation.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.print.evaluation} -\alias{cb.print.evaluation} -\title{Callback closure for printing the result of evaluation} -\usage{ -cb.print.evaluation(period = 1, showsd = TRUE) -} -\arguments{ -\item{period}{results would be printed every number of periods} - -\item{showsd}{whether standard deviations should be printed (when available)} -} -\description{ -Callback closure for printing the result of evaluation -} -\details{ -The callback function prints the result of evaluation at every \code{period} iterations. -The initial and the last iteration's evaluations are always printed. - -Callback function expects the following values to be set in its calling frame: -\code{bst_evaluation} (also \code{bst_evaluation_err} when available), -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}. -} -\seealso{ -\code{\link{callbacks}} -} diff --git a/R-package/man/cb.save.model.Rd b/R-package/man/cb.save.model.Rd deleted file mode 100644 index 7701ad990..000000000 --- a/R-package/man/cb.save.model.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.save.model} -\alias{cb.save.model} -\title{Callback closure for saving a model file.} -\usage{ -cb.save.model(save_period = 0, save_name = "xgboost.ubj") -} -\arguments{ -\item{save_period}{save the model to disk after every -\code{save_period} iterations; 0 means save the model at the end.} - -\item{save_name}{the name or path for the saved model file. - -\if{html}{\out{
}}\preformatted{ Note that the format of the model being saved is determined by the file - extension specified here (see \link{xgb.save} for details about how it works). - - It can contain a \code{\link[base]{sprintf}} formatting specifier - to include the integer iteration number in the file name. - E.g., with \code{save_name} = 'xgboost_\%04d.ubj', - the file saved at iteration 50 would be named "xgboost_0050.ubj". -}\if{html}{\out{
}}} -} -\description{ -Callback closure for saving a model file. -} -\details{ -This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end. - -Callback function expects the following values to be set in its calling frame: -\code{bst}, -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}. -} -\seealso{ -\link{xgb.save} - -\code{\link{callbacks}} -} diff --git a/R-package/man/xgb.Callback.Rd b/R-package/man/xgb.Callback.Rd new file mode 100644 index 000000000..ed1dd7bed --- /dev/null +++ b/R-package/man/xgb.Callback.Rd @@ -0,0 +1,248 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.Callback} +\alias{xgb.Callback} +\title{XGBoost Callback Constructor} +\usage{ +xgb.Callback( + cb_name = "custom_callback", + env = new.env(), + f_before_training = function(env, model, data, watchlist, begin_iteration, + end_iteration) NULL, + f_before_iter = function(env, model, data, watchlist, iteration) NULL, + f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) NULL, + f_after_training = function(env, model, data, watchlist, iteration, final_feval, + prev_cb_res) NULL +) +} +\arguments{ +\item{cb_name}{Name for the callback. + +If the callback produces some non-NULL result (from executing the function passed under +\code{f_after_training}), that result will be added as an R attribute to the resulting booster +(or as a named element in the result of CV), with the attribute name specified here. + +Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.} + +\item{env}{An environment object that will be passed to the different functions in the callback. +Note that this environment will not be shared with other callbacks.} + +\item{f_before_training}{A function that will be executed before the training has started. + +If passing \code{NULL} for this or for the other function inputs, then no function will be executed. + +If passing a function, it will be called with parameters supplied as non-named arguments +matching the function signatures that are shown in the default value for each function argument.} + +\item{f_before_iter}{A function that will be executed before each boosting round. + +This function can signal whether the training should be finalized or not, by outputting +a value that evaluates to \code{TRUE} - i.e. if the output from the function provided here at +a given round is \code{TRUE}, then training will be stopped before the current iteration happens. + +Return values of \code{NULL} will be interpreted as \code{FALSE}.} + +\item{f_after_iter}{A function that will be executed after each boosting round. + +This function can signal whether the training should be finalized or not, by outputting +a value that evaluates to \code{TRUE} - i.e. if the output from the function provided here at +a given round is \code{TRUE}, then training will be stopped at that round. + +Return values of \code{NULL} will be interpreted as \code{FALSE}.} + +\item{f_after_training}{A function that will be executed after training is finished. 
+
+This function can optionally output something non-NULL, which will become part of the R
+attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \link{xgb.train})
+under the name supplied for parameter \code{cb_name} in the case of \link{xgb.train}; or a part
+of the named elements in the result of \link{xgb.cv}.}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+Constructor for defining the structure of callback functions that can be executed
+at different stages of model training (before / after training, before / after each boosting
+iteration).
+}
+\details{
+Arguments that will be passed to the supplied functions are as follows:\itemize{
+
+\item env The same environment that is passed under argument \code{env}.
+
+It may be modified by the functions in order to e.g. keep track of what happens
+across iterations or similar.
+
+This environment is only used by the functions supplied to the callback, and will
+not be kept after the model fitting function terminates (see parameter \code{f_after_training}).
+
+\item model The booster object when using \link{xgb.train}, or the folds when using
+\link{xgb.cv}.
+
+For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{
+\item \code{dtrain}: The training data for the fold (as an \code{xgb.DMatrix} object).
+\item \code{bst}: The \code{xgb.Booster} object for the fold.
+\item \code{watchlist}: A list with two DMatrices, with names \code{train} and \code{test}
+(\code{test} is the held-out data for the fold).
+\item \code{index}: The indices of the hold-out data for that fold (base-1 indexing),
+from which the \code{test} entry in the watchlist was obtained.
+}
+
+This object should \bold{not} be modified in place in ways that conflict with the
+training (e.g. resetting the parameters for a training update in a way that resets
+the number of rounds to zero in order to overwrite rounds).
+
+Note that any R attributes that are assigned to the booster during the callback functions
+will not be kept thereafter, as the booster object variable is not re-assigned during
+training. It is however possible to set C-level attributes of the booster through
+\link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest
+of the iterations and after the training is done.
+
+For keeping variables across iterations, it's recommended to use \code{env} instead.
+\item data The data to which the model is being fit, as an \code{xgb.DMatrix} object.
+
+Note that, for \link{xgb.cv}, this will be the full data, while data for the specific
+folds can be found in the \code{model} object.
+
+\item watchlist The evaluation watchlist, as passed under argument \code{watchlist} to
+\link{xgb.train}.
+
+For \link{xgb.cv}, this will always be \code{NULL}.
+
+\item begin_iteration Index of the first boosting iteration that will be executed
+(base-1 indexing).
+
+This will typically be '1', but when using training continuation, depending on the
+parameters for updates, boosting rounds will be continued from where the previous
+model ended, in which case this will be larger than 1.
+
+\item end_iteration Index of the last boosting iteration that will be executed
+(base-1 indexing, inclusive of this end).
+
+It should match argument \code{nrounds} passed to \link{xgb.train} or \link{xgb.cv}.
+
+Note that boosting might be interrupted before reaching this last iteration, for
+example by using the early stopping callback \link{xgb.cb.early.stop}.
+
+\item iteration Index of the iteration number that is being executed (first iteration
+will be the same as parameter \code{begin_iteration}, then the next one will add +1, and so on).
+
+\item iter_feval Evaluation metrics for the \code{watchlist} that was supplied, either
+determined by the objective, or by parameter \code{feval}.
+
+For \link{xgb.train}, this will be a named vector with one entry per element in
+\code{watchlist}, where the names are determined as 'watchlist name' + '-' + 'metric name' - for
+example, if \code{watchlist} contains an entry named "tr" and the metric is "rmse",
+this will be a one-element vector with name "tr-rmse".
+
+For \link{xgb.cv}, this will be a 2d matrix with dimensions \verb{[length(watchlist), nfolds]},
+where the row names will follow the same naming logic as the one-dimensional vector
+that is passed in \link{xgb.train}.
+
+Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize
+this table by calculating the row-wise means and standard deviations.
+
+\item final_feval The evaluation results after the last boosting round is executed
+(same format as \code{iter_feval}, and will be the exact same input as passed under
+\code{iter_feval} to the last round that is executed during model fitting).
+
+\item prev_cb_res Result from a previous run of a callback sharing the same name
+(as given by parameter \code{cb_name}) when conducting training continuation, if there
+was any in the booster R attributes.
+
+Sometimes, one might want to append the new results to the previous ones, and this will
+be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
+which will append the new rows to the previous table.
+
+If no such previous callback result is available (which it never will be when fitting
+a model from scratch instead of updating an existing model), this will be \code{NULL}.
+
+For \link{xgb.cv}, which doesn't support training continuation, this will always be \code{NULL}.
+}
+
+The following names (\code{cb_name} values) are reserved for internal callbacks:\itemize{
+\item print_evaluation
+\item evaluation_log
+\item reset_parameters
+\item early_stop
+\item save_model
+\item cv_predict
+\item gblinear_history
+}
+
+The following names are reserved for other non-callback attributes:\itemize{
+\item names
+\item class
+\item call
+\item params
+\item niter
+\item nfeatures
+\item folds
+}
+
+When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback
+will always be executed before the others, as it sets some booster C-level attributes
+that other callbacks might also use. Otherwise, the order of execution will match
+the order in which the callbacks are passed to the model fitting function.
+}
+\examples{
+# Example constructing a custom callback that calculates
+# squared error on the training data, without a watchlist,
+# and outputs the per-iteration results.
+ssq_callback <- xgb.Callback(
+  cb_name = "ssq",
+  f_before_training = function(env, model, data, watchlist,
+                               begin_iteration, end_iteration) {
+    # A vector to keep track of a number at each iteration
+    env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
+  },
+  f_after_iter = function(env, model, data, watchlist, iteration, iter_feval) {
+    # This calculates the sum of squared errors on the training data.
+    # Note that this can be better done by passing a 'watchlist' entry,
+    # but this demonstrates a way in which callbacks can be structured.
+    pred <- predict(model, data)
+    err <- pred - getinfo(data, "label")
+    sq_err <- sum(err^2)
+    env$logs[iteration] <- sq_err
+    cat(
+      sprintf(
+        "Squared error at iteration \%d: \%.2f\n",
+        iteration, sq_err
+      )
+    )
+
+    # A return value of 'TRUE' here would signal to finalize the training
+    return(FALSE)
+  },
+  f_after_training = function(env, model, data, watchlist, iteration,
+                              final_feval, prev_cb_res) {
+    return(env$logs)
+  }
+)
+
+data(mtcars)
+y <- mtcars$mpg
+x <- as.matrix(mtcars[, -1])
+dm <- xgb.DMatrix(x, label = y, nthread = 1)
+model <- xgb.train(
+  data = dm,
+  params = list(objective = "reg:squarederror", nthread = 1),
+  nrounds = 5,
+  callbacks = list(ssq_callback),
+  keep_extra_attributes = TRUE
+)
+
+# Result from 'f_after_training' will be available as an attribute
+attributes(model)$ssq
+}
+\seealso{
+Built-in callbacks:\itemize{
+\item \link{xgb.cb.print.evaluation}
+\item \link{xgb.cb.evaluation.log}
+\item \link{xgb.cb.reset.parameters}
+\item \link{xgb.cb.early.stop}
+\item \link{xgb.cb.save.model}
+\item \link{xgb.cb.cv.predict}
+\item \link{xgb.cb.gblinear.history}
+}
+}
diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/xgb.cb.cv.predict.Rd
similarity index 53%
rename from R-package/man/cb.cv.predict.Rd
rename to R-package/man/xgb.cb.cv.predict.Rd
index 4cabac1c9..d2d9a084b 100644
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/xgb.cb.cv.predict.Rd
@@ -1,16 +1,27 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.cv.predict}
-\alias{cb.cv.predict}
-\title{Callback closure for returning cross-validation based predictions.}
+\name{xgb.cb.cv.predict}
+\alias{xgb.cb.cv.predict}
+\title{Callback for returning cross-validation based predictions.}
 \usage{
-cb.cv.predict(save_models = FALSE)
+xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE)
 }
 \arguments{
-\item{save_models}{a flag for whether to save the folds' models.}
+\item{save_models}{A flag for whether to save the folds' models.}
+
+\item{outputmargin}{Whether to save margin predictions (same effect as passing this
+parameter to \link{predict.xgb.Booster}).}
 }
 \value{
-Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
+An \code{xgb.Callback} object, which can be passed to \link{xgb.cv},
+but \bold{not} to \link{xgb.train}.
+}
+\description{
+This callback function saves predictions for all of the test folds,
+and also allows saving the folds' models.
+}
+\details{
+Predictions are saved inside the \code{pred} element, which is either a vector or a matrix,
 depending on the number of prediction outputs per data row. The order of predictions corresponds
 to the order of rows in the original dataset. Note that when a custom \code{folds} list is
 provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
@@ -19,23 +30,3 @@ meaningful when user-provided folds have overlapping indices as in, e.g., random
 When some of the indices in the training dataset are not included into user-provided \code{folds},
 their prediction value would be \code{NA}.
 }
-\description{
-Callback closure for returning cross-validation based predictions.
-}
-\details{
-This callback function saves predictions for all of the test folds,
-and also allows to save the folds' models.
-
-It is a "finalizer" callback and it uses early stopping information whenever it is available,
-thus it must be run after the early stopping callback if the early stopping is used.
-
-Callback function expects the following values to be set in its calling frame:
-\code{bst_folds},
-\code{basket},
-\code{data},
-\code{end_iteration},
-\code{params},
-}
-\seealso{
-\code{\link{callbacks}}
-}
diff --git a/R-package/man/xgb.cb.early.stop.Rd b/R-package/man/xgb.cb.early.stop.Rd
new file mode 100644
index 000000000..26d2f1aa3
--- /dev/null
+++ b/R-package/man/xgb.cb.early.stop.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.early.stop}
+\alias{xgb.cb.early.stop}
+\title{Callback to activate early stopping}
+\usage{
+xgb.cb.early.stop(
+  stopping_rounds,
+  maximize = FALSE,
+  metric_name = NULL,
+  verbose = TRUE,
+  keep_all_iter = TRUE
+)
+}
+\arguments{
+\item{stopping_rounds}{The number of rounds with no improvement in
+the evaluation metric after which training is stopped.}
+
+\item{maximize}{Whether to maximize the evaluation metric.}
+
+\item{metric_name}{The name of an evaluation column to use as a criterion for early
+stopping. If not set, the last column would be used.
+Let's say the test data in \code{watchlist} was labelled as \code{dtest},
+and one wants to use the AUC in test data for early stopping regardless of where
+it is in the \code{watchlist}, then one of the following would need to be set:
+\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
+All dash '-' characters in metric names are considered equivalent to '_'.}
+
+\item{verbose}{Whether to print the early stopping information.}
+
+\item{keep_all_iter}{Whether to keep all of the boosting rounds that were produced
+in the resulting object. If passing \code{FALSE}, will only keep the boosting rounds
+up to the detected best iteration, discarding the ones that come after.}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+This callback function determines the condition for early stopping.
+
+The following attributes are assigned to the booster's object:
+\itemize{
+\item \code{best_score} the evaluation score at the best iteration
+\item \code{best_iteration} the boosting iteration at which the best score occurred
+(0-based index for interoperability of binary models)
+}
+
+The same values are also stored as R attributes as a result of the callback, plus an additional
+attribute \code{stopped_by_max_rounds} which indicates whether early stopping by the \code{stopping_rounds}
+condition occurred. Note that the \code{best_iteration} that is stored under R attributes will follow
+base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed
+through \link{xgb.attr} or \link{xgb.attributes}.
+
+At least one data element is required in the evaluation watchlist for early stopping to work.
+}
diff --git a/R-package/man/xgb.cb.evaluation.log.Rd b/R-package/man/xgb.cb.evaluation.log.Rd
new file mode 100644
index 000000000..1dab64647
--- /dev/null
+++ b/R-package/man/xgb.cb.evaluation.log.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.evaluation.log}
+\alias{xgb.cb.evaluation.log}
+\title{Callback for logging the evaluation history}
+\usage{
+xgb.cb.evaluation.log()
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+Callback for logging the evaluation history
+}
+\details{
+This callback creates a table with per-iteration evaluation metrics (see parameters
+\code{watchlist} and \code{feval} in \link{xgb.train}).
+
+Note: in the column names of the final data.table, the dash '-' character is replaced with
+the underscore '_' in order to make the column names more like regular R identifiers.
+}
+\seealso{
+\link{xgb.cb.print.evaluation}
+}
diff --git a/R-package/man/cb.gblinear.history.Rd b/R-package/man/xgb.cb.gblinear.history.Rd
similarity index 63%
rename from R-package/man/cb.gblinear.history.Rd
rename to R-package/man/xgb.cb.gblinear.history.Rd
index 2a03c14db..0ebaa4685 100644
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/xgb.cb.gblinear.history.Rd
@@ -1,37 +1,48 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.gblinear.history}
-\alias{cb.gblinear.history}
-\title{Callback closure for collecting the model coefficients history of a gblinear booster
-during its training.}
+\name{xgb.cb.gblinear.history}
+\alias{xgb.cb.gblinear.history}
+\title{Callback for collecting coefficients history of a gblinear booster}
 \usage{
-cb.gblinear.history(sparse = FALSE)
+xgb.cb.gblinear.history(sparse = FALSE)
 }
 \arguments{
-\item{sparse}{when set to FALSE/TRUE, a dense/sparse matrix is used to store the result.
+\item{sparse}{when set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
 Sparse format is useful when one expects only a subset of coefficients to be non-zero,
 when using the "thrifty" feature selector with a fairly small number of top features
 selected per iteration.}
 }
 \value{
-Results are stored in the \code{coefs} element of the closure.
-The \code{\link{xgb.gblinear.history}} convenience function provides an easy
-way to access it.
-With \code{xgb.train}, it is either a dense of a sparse matrix.
-While with \code{xgb.cv}, it is a list (an element per each fold) of such
-matrices.
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
 }
 \description{
-Callback closure for collecting the model coefficients history of a gblinear booster
-during its training.
+Callback for collecting coefficients history of a gblinear booster
 }
 \details{
 To keep things fast and simple, gblinear booster does not internally store the history of linear
 model coefficients at each boosting iteration. This callback provides a workaround for storing
 the coefficients' path, by extracting them after each training iteration.
 
-Callback function expects the following values to be set in its calling frame:
-\code{bst} (or \code{bst_folds}).
+This callback will construct a matrix where rows are boosting iterations and columns are
+feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept
+corresponding to the first column).
+
+When there is more than one coefficient per feature (e.g.
multi-class classification),
+the result will be reshaped into a vector where coefficients are arranged first by features and
+then by class (e.g. first 1 through N coefficients will be for the first class, then
+coefficients N+1 through 2N for the second class, and so on).
+
+If the result has only one coefficient per feature in the data, then the resulting matrix
+will have column names matching the feature names, otherwise (when there's more than
+one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
+(so e.g. column 'c1' for class '0' will be named 'c1:0').
+
+With \code{xgb.train}, the output is either a dense or a sparse matrix.
+With \code{xgb.cv}, it is a list (one element per fold) of such
+matrices.
+
+The \link{xgb.gblinear.history} function provides an easy way to retrieve the
+outputs from this callback.
 }
 \examples{
 #### Binary classification:
@@ -52,7 +63,7 @@ param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "a
 # rate does not break the convergence, but allows us to illustrate the typical pattern of
 # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1.,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 # Extract the coefficients' path and plot them vs boosting iteration number:
 coef_path <- xgb.gblinear.history(bst)
 matplot(coef_path, type = 'l')
@@ -61,7 +72,7 @@ matplot(coef_path, type = 'l')
 # Will try the classical componentwise boosting which selects a single best feature per round:
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8,
                  updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 matplot(xgb.gblinear.history(bst), type = 'l')
 # Componentwise boosting is known to have similar effect to Lasso regularization.
 # Try experimenting with various values of top_k, eta, nrounds,
@@ -69,7 +80,7 @@ matplot(xgb.gblinear.history(bst), type = 'l')
 
 # For xgb.cv:
 bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8,
-              callbacks = list(cb.gblinear.history()))
+              callbacks = list(xgb.cb.gblinear.history()))
 # coefficients in the CV fold #3
 matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
 
@@ -82,7 +93,7 @@ param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
 # For the default linear updater 'shotgun' it sometimes is helpful
 # to use smaller eta to reduce instability
 bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
-                 callbacks = list(cb.gblinear.history()))
+                 callbacks = list(xgb.cb.gblinear.history()))
 # Will plot the coefficient paths separately for each class:
 matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
 matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l')
@@ -90,11 +101,11 @@ matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l')
 
 # CV:
 bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5,
-              callbacks = list(cb.gblinear.history(FALSE)))
+              callbacks = list(xgb.cb.gblinear.history(FALSE)))
 # 1st fold of 1st class
 matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l')
 }
 \seealso{
-\code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+\link{xgb.gblinear.history}, \link{coef.xgb.Booster}.
}
diff --git a/R-package/man/xgb.cb.print.evaluation.Rd b/R-package/man/xgb.cb.print.evaluation.Rd
new file mode 100644
index 000000000..c4f2e6991
--- /dev/null
+++ b/R-package/man/xgb.cb.print.evaluation.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.print.evaluation}
+\alias{xgb.cb.print.evaluation}
+\title{Callback for printing the result of evaluation}
+\usage{
+xgb.cb.print.evaluation(period = 1, showsd = TRUE)
+}
+\arguments{
+\item{period}{results would be printed every \code{period} iterations}
+
+\item{showsd}{whether standard deviations should be printed (when available)}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
+\description{
+The callback function prints the result of evaluation at every \code{period} iterations.
+The initial and the last iteration's evaluations are always printed.
+
+Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that).
+}
+\seealso{
+\link{xgb.Callback}
+}
diff --git a/R-package/man/cb.reset.parameters.Rd b/R-package/man/xgb.cb.reset.parameters.Rd
similarity index 57%
rename from R-package/man/cb.reset.parameters.Rd
rename to R-package/man/xgb.cb.reset.parameters.Rd
index ee0a5d1bd..c7e863817 100644
--- a/R-package/man/cb.reset.parameters.Rd
+++ b/R-package/man/xgb.cb.reset.parameters.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callbacks.R
-\name{cb.reset.parameters}
-\alias{cb.reset.parameters}
-\title{Callback closure for resetting the booster's parameters at each iteration.}
+\name{xgb.cb.reset.parameters}
+\alias{xgb.cb.reset.parameters}
+\title{Callback for resetting the booster's parameters at each iteration.}
 \usage{
-cb.reset.parameters(new_params)
+xgb.cb.reset.parameters(new_params)
 }
 \arguments{
 \item{new_params}{a list where each element corresponds to a parameter that needs to be reset.
@@ -14,23 +14,16 @@ or a function of two parameters \code{learning_rates(iteration, nrounds)} which
 returns a new parameter value by using the current iteration number and the total number of
 boosting rounds.}
 }
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+}
 \description{
-Callback closure for resetting the booster's parameters at each iteration.
+Callback for resetting the booster's parameters at each iteration.
 }
 \details{
-This is a "pre-iteration" callback function used to reset booster's parameters
-at the beginning of each iteration.
-
 Note that when training is resumed from some previous model, and a function is used to
 reset a parameter value, the \code{nrounds} argument in this function would be the
 number of boosting rounds in the current training.
 
-Callback function expects the following values to be set in its calling frame:
-\code{bst} or \code{bst_folds},
-\code{iteration},
-\code{begin_iteration},
-\code{end_iteration}.
-}
-\seealso{
-\code{\link{callbacks}}
+Does not leave any attribute in the booster.
}
diff --git a/R-package/man/xgb.cb.save.model.Rd b/R-package/man/xgb.cb.save.model.Rd
new file mode 100644
index 000000000..8ddba2f1a
--- /dev/null
+++ b/R-package/man/xgb.cb.save.model.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/callbacks.R
+\name{xgb.cb.save.model}
+\alias{xgb.cb.save.model}
+\title{Callback for saving a model file.}
+\usage{
+xgb.cb.save.model(save_period = 0, save_name = "xgboost.ubj")
+}
+\arguments{
+\item{save_period}{Save the model to disk after every
+\code{save_period} iterations; 0 means save the model at the end.}
+
+\item{save_name}{The name or path for the saved model file.
+It can contain a \code{\link[base]{sprintf}} formatting specifier
+to include the integer iteration number in the file name.
+E.g., with \code{save_name} = 'xgboost_\%04d.model',
+the file saved at iteration 50 would be named "xgboost_0050.model".}
+}
+\value{
+An \code{xgb.Callback} object, which can be passed to \link{xgb.train},
+but \bold{not} to \link{xgb.cv}.
+}
+\description{
+This callback function allows saving an xgb-model file, either periodically
+after every \code{save_period} iterations or at the end.
+
+Does not leave any attribute in the booster.
+}
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 9f6103a52..778b4540a 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -59,7 +59,7 @@ that NA values should be considered as 'missing' by the algorithm.
 Sometimes, 0 or other extreme value might be used to represent missing values.}
 
 \item{prediction}{A logical value indicating whether to return the test fold predictions
-from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.}
+from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.}
 
 \item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
 
@@ -98,20 +98,20 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
 
 \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
 Default is 1 which means all messages are printed. This parameter is passed to the
-\code{\link{cb.print.evaluation}} callback.}
+\code{\link{xgb.cb.print.evaluation}} callback.}
 
 \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered.
 If set to an integer \code{k},
 training with a validation set will stop if the performance
 doesn't improve for \code{k} rounds.
-Setting this parameter engages the \code{\link{cb.early.stop}} callback.}
+Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
 then this parameter must be set as well.
 When it is \code{TRUE}, it means the larger the evaluation score the better.
-This parameter is passed to the \code{\link{cb.early.stop}} callback.}
+This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{callbacks}{a list of callback functions to perform various tasks during boosting.
-See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 parameters' values. User can provide either existing or their own callback methods in order
 to customize the training process.
 
@@ -122,24 +122,24 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 \itemize{
 \item \code{call} a function call.
 \item \code{params} parameters that were passed to the xgboost library. Note that it does not
-capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
-\item \code{callbacks} callback functions that were either automatically assigned or
-explicitly passed.
+capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback.
 \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
 first column corresponding to iteration number and the rest corresponding to the
 CV-based evaluation means and standard deviations for the training and test CV-sets.
-It is created by the \code{\link{cb.evaluation.log}} callback.
+It is created by the \code{\link{xgb.cb.evaluation.log}} callback.
 \item \code{niter} number of boosting iterations.
 \item \code{nfeatures} number of features in training data.
 \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{pred} CV prediction values available when \code{prediction} is set.
-It is either vector or matrix (see \code{\link{cb.cv.predict}}).
-\item \code{models} a list of the CV folds' models. It is only available with the explicit
-setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
 }
+
+Plus other potential elements that are the result of callbacks, such as a list \code{cv_predict} with
+a sub-element \code{pred} when passing \code{prediction = TRUE}, which is added by the \link{xgb.cb.cv.predict}
+callback (note that one can also pass it manually under \code{callbacks} with different settings,
+such as also saving the models created during cross-validation); or a list \code{early_stop} which
+will contain elements such as \code{best_iteration} when using the early stopping callback (\link{xgb.cb.early.stop}).
 }
 \description{
 The cross validation function of xgboost
diff --git a/R-package/man/xgb.gblinear.history.Rd b/R-package/man/xgb.gblinear.history.Rd
index 103be16f1..25aef7163 100644
--- a/R-package/man/xgb.gblinear.history.Rd
+++ b/R-package/man/xgb.gblinear.history.Rd
@@ -8,7 +8,7 @@ xgb.gblinear.history(model, class_index = NULL)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
-using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
+using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster
 loaded from \link{xgb.load} or \link{xgb.load.raw}.}
 
 \item{class_index}{zero-based class index to extract the coefficients for only that
@@ -16,23 +16,31 @@ specific class in a multinomial multiclass model. When it is NULL, all the
 coefficients are returned. Has no effect in non-multiclass models.}
 }
 \value{
-For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns
-corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would
-return) and the rows corresponding to boosting iterations.
+For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns
+corresponding to the coefficients and the rows corresponding to boosting iterations.
 
-For an \code{xgb.cv} result, a list of such matrices is returned with the elements
+For an \link{xgb.cv} result, a list of such matrices is returned with the elements
 corresponding to CV folds.
+
+When there is more than one coefficient per feature (e.g.
multi-class classification) +and \code{class_index} is not provided, +the result will be reshaped into a vector where coefficients are arranged first by features and +then by class (e.g. first 1 through N coefficients will be for the first class, then +coefficients N+1 through 2N for the second class, and so on). } \description{ A helper function to extract the matrix of linear coefficients' history -from a gblinear model created while using the \code{cb.gblinear.history()} -callback. +from a gblinear model created while using the \link{xgb.cb.gblinear.history} +callback (which must be added manually as by default it's not used). } \details{ Note that this is an R-specific function that relies on R attributes that are not saved when using xgboost's own serialization functions like \link{xgb.load} or \link{xgb.load.raw}. -In order for a serialized model to be accepted by tgis function, one must use R +In order for a serialized model to be accepted by this function, one must use R serializers such as \link{saveRDS}. } +\seealso{ +\link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. +} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1fbe0055e..e18a900e3 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -17,7 +17,7 @@ Load xgboost model from the binary model file. } \details{ The input file is expected to contain a model saved in an xgboost model format -using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some appropriate methods from other xgboost interfaces. E.g., a model trained in Python and saved from there in xgboost format, could be loaded from R. diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 21c5fe7ee..45c78ae13 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -162,7 +162,7 @@ List is provided in detail section.} Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each of these datasets during each boosting iteration, and stored in the end as a field named \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or -\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously +\code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously printed out during the training. E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track the performance of each round's model on mat1 and mat2.} @@ -177,24 +177,24 @@ prediction and dtrain.} \item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. If 2, some additional information will be printed out. Note that setting \code{verbose > 0} automatically engages the -\code{cb.print.evaluation(period=1)} callback function.} +\code{xgb.cb.print.evaluation(period=1)} callback function.} \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. Default is 1 which means all messages are printed. This parameter is passed to the -\code{\link{cb.print.evaluation}} callback.} +\code{\link{xgb.cb.print.evaluation}} callback.} \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance doesn't improve for \code{k} rounds. 
-Setting this parameter engages the \code{\link{cb.early.stop}} callback.}
+Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set,
 then this parameter must be set as well.
 When it is \code{TRUE}, it means the larger the evaluation score the better.
-This parameter is passed to the \code{\link{cb.early.stop}} callback.}
+This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.}
 
 \item{save_period}{when it is non-NULL, model is saved to disk after every \code{save_period} rounds,
-0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.}
+0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback.}
 
 \item{save_name}{the name or path for periodically saved model file.}
 
@@ -203,12 +203,13 @@ Could be either an object of class \code{xgb.Booster}, or its raw data, or the n
 file with a previously saved model.}
 
 \item{callbacks}{a list of callback functions to perform various tasks during boosting.
-See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the
 parameters' values. User can provide either existing or their own callback methods in order
 to customize the training process.
 
-\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs - are kept as R attributes, and thus do not get saved when using non-R serializaters like +\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to leave attributes in the resulting model object,
+    such as an evaluation log (a `data.table` object) - be aware that these objects are kept
+    as R attributes, and thus do not get saved when using XGBoost's own serializers like
+    \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
}\if{html}{\out{
}}} @@ -269,18 +270,19 @@ Different threshold (e.g., 0.) could be specified as "error@0." The following callbacks are automatically created when certain parameters are set: \itemize{ -\item \code{cb.print.evaluation} is turned on when \code{verbose > 0}; +\item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0}; and the \code{print_every_n} parameter is passed to it. -\item \code{cb.evaluation.log} is on when \code{watchlist} is present. -\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. -\item \code{cb.save.model}: when \code{save_period > 0} is set. +\item \code{xgb.cb.evaluation.log} is on when \code{watchlist} is present. +\item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set. +\item \code{xgb.cb.save.model}: when \code{save_period > 0} is set. } Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently from typical R objects (it's an 'altrep' list class), and it makes a separation between internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr} and shared between interfaces through serialization functions like \link{xgb.save}; and -R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise +R-specific attributes (typically the result from a callback), accessed through \link{attributes} +and \link{attr}, which are otherwise only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and not anyhow used by functions like \link{predict.xgb.Booster}. @@ -348,7 +350,7 @@ param <- list(max_depth = 2, eta = 1, nthread = nthread, objective = "binary:logistic", eval_metric = "auc") my_etas <- list(eta = c(0.5, 0.1)) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_etas))) + callbacks = list(xgb.cb.reset.parameters(my_etas))) ## Early stopping: bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, @@ -366,7 +368,7 @@ Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754} } \seealso{ -\code{\link{callbacks}}, +\code{\link{xgb.Callback}}, \code{\link{predict.xgb.Booster}}, \code{\link{xgb.cv}} } diff --git a/R-package/src/init.c b/R-package/src/init.c index f2635742e..c869871c6 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -76,6 +76,7 @@ extern SEXP XGBSetGlobalConfig_R(SEXP); extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); extern SEXP XGBoosterSlice_R(SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterSliceAndReplace_R(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1}, @@ -138,6 +139,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0}, {"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2}, {"XGBoosterSlice_R", (DL_FUNC) &XGBoosterSlice_R, 4}, + {"XGBoosterSliceAndReplace_R", (DL_FUNC) &XGBoosterSliceAndReplace_R, 4}, {NULL, NULL, 0} }; diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 5baf8d412..2228932bd 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -1674,3 +1674,18 @@ XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEX Rf_unprotect(1); return out; } + +XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP 
end_layer, SEXP step) {
+  R_API_BEGIN();
+  BoosterHandle old_handle = R_ExternalPtrAddr(handle);
+  BoosterHandle new_handle = nullptr;
+  CHECK_CALL(XGBoosterSlice(old_handle,
+                            Rf_asInteger(begin_layer),
+                            Rf_asInteger(end_layer),
+                            Rf_asInteger(step),
+                            &new_handle));
+  R_SetExternalPtrAddr(handle, new_handle);
+  CHECK_CALL(XGBoosterFree(old_handle));
+  R_API_END();
+  return R_NilValue;
+}
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 70fd885e7..cea50c146 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -535,4 +535,14 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config);
 */
 XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
 
+/*!
+ * \brief Slice a fitted booster model (by rounds), and replace its handle with the result
+ * \param handle handle to the fitted booster
+ * \param begin_layer start of the slice
+ * \param end_layer end of the slice; end_layer=0 is equivalent to end_layer=num_boost_round
+ * \param step step size of the slice
+ * \return NULL
+ */
+XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
+
 #endif  // XGBOOST_WRAPPER_R_H_  // NOLINT(*)
diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R
index 7cf711292..bad6c1df3 100644
--- a/R-package/tests/testthat.R
+++ b/R-package/tests/testthat.R
@@ -1,5 +1,6 @@
 library(testthat)
 library(xgboost)
+library(Matrix)
 
 test_check("xgboost", reporter = ProgressReporter)
 RhpcBLASctl::omp_set_num_threads(1)
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 5438c8bb2..ee0f4c7ba 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -348,7 +348,6 @@ test_that("xgb.cv works", {
   expect_false(is.null(cv$folds) && is.list(cv$folds))
   expect_length(cv$folds, 5)
   expect_false(is.null(cv$params) && is.list(cv$params))
-  expect_false(is.null(cv$callbacks))
   expect_false(is.null(cv$call))
 })
 
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index c60d0c246..a0b4910cc 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -28,79 +28,125 @@ param <- list(objective = "binary:logistic", eval_metric = "error",
               max_depth = 2, nthread = n_threads)
 
-test_that("cb.print.evaluation works as expected", {
+test_that("xgb.cb.print.evaluation works as expected for xgb.train", {
+  logs1 <- capture.output({
+    model <- xgb.train(
+      data = dtrain,
+      params = list(
+        objective = "binary:logistic",
+        eval_metric = "auc",
+        max_depth = 2,
+        nthread = n_threads
+      ),
+      nrounds = 10,
+      watchlist = list(train = dtrain, test = dtest),
+      callbacks = list(xgb.cb.print.evaluation(period = 1))
+    )
+  })
+  expect_equal(length(logs1), 10)
+  expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs1)))
+  lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x])))
 
-  bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8)
-  bst_evaluation_err <- NULL
-  begin_iteration <- 1
-  end_iteration <- 7
-
-  f0 <- cb.print.evaluation(period = 0)
-  f1 <- cb.print.evaluation(period = 1)
-  f5 <- cb.print.evaluation(period = 5)
-
-  expect_false(is.null(attr(f1, 'call')))
-  expect_equal(attr(f1, 'name'), 'cb.print.evaluation')
-
-  iteration <- 1
-  expect_silent(f0())
-  expect_output(f1(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
-  expect_output(f5(),
"\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_null(f1()) - - iteration <- 2 - expect_output(f1(), "\\[2\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_silent(f5()) - - iteration <- 7 - expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - - bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) - expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000±0.100000\ttest-auc:0.800000±0.200000") + logs2 <- capture.output({ + model <- xgb.train( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + watchlist = list(train = dtrain, test = dtest), + callbacks = list(xgb.cb.print.evaluation(period = 2)) + ) + }) + expect_equal(length(logs2), 6) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs2))) + seq_matches <- c(seq(1, 10, 2), 10) + lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x]))) }) -test_that("cb.evaluation.log works as expected", { +test_that("xgb.cb.print.evaluation works as expected for xgb.cv", { + logs1 <- capture.output({ + model <- xgb.cv( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + nfold = 3, + callbacks = list(xgb.cb.print.evaluation(period = 1, showsd = TRUE)) + ) + }) + expect_equal(length(logs1), 10) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs1))) + lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x]))) - bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) - bst_evaluation_err <- NULL + logs2 <- capture.output({ + model <- xgb.cv( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + nfold = 3, + callbacks = list(xgb.cb.print.evaluation(period = 2, showsd = TRUE)) + ) + }) + expect_equal(length(logs2), 6) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs2))) + seq_matches <- c(seq(1, 10, 2), 10) + lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x]))) +}) - evaluation_log <- list() - f <- cb.evaluation.log() +test_that("xgb.cb.evaluation.log works as expected for xgb.train", { + model <- xgb.train( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + verbose = FALSE, + watchlist = list(train = dtrain, test = dtest), + callbacks = list(xgb.cb.evaluation.log()) + ) + logs <- attributes(model)$evaluation_log - expect_false(is.null(attr(f, 'call'))) - expect_equal(attr(f, 'name'), 'cb.evaluation.log') + expect_equal(nrow(logs), 10) + expect_equal(colnames(logs), c("iter", "train_auc", "test_auc")) +}) - iteration <- 1 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, bst_evaluation))) - iteration <- 2 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation))) - expect_silent(f(finalize = TRUE)) - expect_equal(evaluation_log, - data.table::data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8))) +test_that("xgb.cb.evaluation.log works as expected for xgb.cv", { + model <- xgb.cv( + data = 
dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + verbose = FALSE, + nfold = 3, + callbacks = list(xgb.cb.evaluation.log()) + ) + logs <- model$evaluation_log - bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) - evaluation_log <- list() - f <- cb.evaluation.log() - - iteration <- 1 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)))) - iteration <- 2 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)), - c(iter = 2, c(bst_evaluation, bst_evaluation_err)))) - expect_silent(f(finalize = TRUE)) - expect_equal(evaluation_log, - data.table::data.table(iter = 1:2, - train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1), - test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2))) + expect_equal(nrow(logs), 10) + expect_equal( + colnames(logs), + c("iter", "train_auc_mean", "train_auc_std", "test_auc_mean", "test_auc_std") + ) }) @@ -116,7 +162,7 @@ test_that("can store evaluation_log without printing", { expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2) }) -test_that("cb.reset.parameters works as expected", { +test_that("xgb.cb.reset.parameters works as expected", { # fixed eta set.seed(111) @@ -128,7 +174,7 @@ test_that("cb.reset.parameters works as expected", { set.seed(111) my_par <- list(eta = c(0.9, 0.9)) bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst1)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, attributes(bst1)$evaluation_log$train_error) @@ -137,7 +183,7 @@ test_that("cb.reset.parameters works as expected", { set.seed(111) my_par <- list(eta = function(itr, itr_end) 0.9) bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst2)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, attributes(bst2)$evaluation_log$train_error) @@ -146,7 +192,7 @@ test_that("cb.reset.parameters works as expected", { set.seed(111) my_par <- list(eta = c(0.6, 0.5)) bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst3)$evaluation_log$train_error)) expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error)) @@ -154,25 +200,25 @@ test_that("cb.reset.parameters works as expected", { my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8)) expect_error( bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no error # CV works as well expect_error( bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no error # expect no learning with 0 learning rate my_par <- list(eta = c(0., 0.)) bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = 
list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bstX)$evaluation_log$train_error)) er <- unique(attributes(bstX)$evaluation_log$train_error) expect_length(er, 1) expect_gt(er, 0.4) }) -test_that("cb.save.model works as expected", { +test_that("xgb.cb.save.model works as expected", { files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json') files <- unname(sapply(files, function(f) file.path(tempdir(), f))) for (f in files) if (file.exists(f)) file.remove(f) @@ -238,8 +284,8 @@ test_that("early stopping using a specific metric works", { expect_output( bst <- xgb.train(param[-2], dtrain, nrounds = 20, watchlist, eta = 0.6, eval_metric = "logloss", eval_metric = "auc", - callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE, - metric_name = 'test_logloss'))) + callbacks = list(xgb.cb.early.stop(stopping_rounds = 3, maximize = FALSE, + metric_name = 'test_logloss'))) , "Stopping. Best iteration") expect_false(is.null(xgb.attr(bst, "best_iteration"))) expect_lt(xgb.attr(bst, "best_iteration"), 19) @@ -281,10 +327,10 @@ test_that("early stopping xgb.cv works", { cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.3, nrounds = 20, early_stopping_rounds = 3, maximize = FALSE) , "Stopping. Best iteration") - expect_false(is.null(cv$best_iteration)) - expect_lt(cv$best_iteration, 19) + expect_false(is.null(cv$early_stop$best_iteration)) + expect_lt(cv$early_stop$best_iteration, 19) # the best error is min error: - expect_true(cv$evaluation_log[, test_error_mean[cv$best_iteration] == min(test_error_mean)]) + expect_true(cv$evaluation_log[, test_error_mean[cv$early_stop$best_iteration] == min(test_error_mean)]) }) test_that("prediction in xgb.cv works", { @@ -292,19 +338,19 @@ test_that("prediction in xgb.cv works", { nrounds <- 4 cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) - err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f])))) err_log <- cv$evaluation_log[nrounds, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) # save CV models set.seed(11) cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0, - callbacks = list(cb.cv.predict(save_models = TRUE))) + callbacks = list(xgb.cb.cv.predict(save_models = TRUE))) expect_equal(cv$evaluation_log, cvx$evaluation_log) - expect_length(cvx$models, 5) - expect_true(all(sapply(cvx$models, class) == 'xgb.Booster')) + expect_length(cvx$cv_predict$models, 5) + expect_true(all(sapply(cvx$cv_predict$models, class) == 'xgb.Booster')) }) test_that("prediction in xgb.cv works for gblinear too", { @@ -312,8 +358,8 @@ test_that("prediction in xgb.cv works for gblinear too", { p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads) cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) }) test_that("prediction in early-stopping xgb.cv works", { @@ -324,14 +370,14 @@ test_that("prediction in early-stopping xgb.cv 
works", { prediction = TRUE, base_score = 0.5) , "Stopping. Best iteration") - expect_false(is.null(cv$best_iteration)) - expect_lt(cv$best_iteration, 19) + expect_false(is.null(cv$early_stop$best_iteration)) + expect_lt(cv$early_stop$best_iteration, 19) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) - err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) - err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean] + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f])))) + err_log <- cv$evaluation_log[cv$early_stop$best_iteration, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) err_log_last <- cv$evaluation_log[cv$niter, test_error_mean] expect_gt(abs(err_pred - err_log_last), 1e-4) @@ -346,9 +392,9 @@ test_that("prediction in xgb.cv for softprob works", { subsample = 0.8, gamma = 2, verbose = 0, prediction = TRUE, objective = "multi:softprob", num_class = 3) , NA) - expect_false(is.null(cv$pred)) - expect_equal(dim(cv$pred), c(nrow(iris), 3)) - expect_lt(diff(range(rowSums(cv$pred))), 1e-6) + expect_false(is.null(cv$cv_predict$pred)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(iris), 3)) + expect_lt(diff(range(rowSums(cv$cv_predict$pred))), 1e-6) }) test_that("prediction in xgb.cv works for multi-quantile", { @@ -368,7 +414,7 @@ test_that("prediction in xgb.cv works for multi-quantile", { prediction = TRUE, verbose = 0 ) - expect_equal(dim(cv$pred), c(nrow(x), 5)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 5)) }) test_that("prediction in xgb.cv works for multi-output", { @@ -389,5 +435,46 @@ test_that("prediction in xgb.cv works for multi-output", { prediction = TRUE, verbose = 0 ) - expect_equal(dim(cv$pred), c(nrow(x), 2)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 2)) +}) + +test_that("prediction in xgb.cv works for multi-quantile", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + cv <- xgb.cv( + data = dm, + params = list( + objective = "reg:quantileerror", + quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9), + nthread = 1 + ), + nrounds = 5, + nfold = 3, + prediction = TRUE, + verbose = 0 + ) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 5)) +}) + +test_that("prediction in xgb.cv works for multi-output", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1) + cv <- xgb.cv( + data = dm, + params = list( + tree_method = "hist", + multi_strategy = "multi_output_tree", + objective = "reg:squarederror", + nthread = n_threads + ), + nrounds = 5, + nfold = 3, + prediction = TRUE, + verbose = 0 + ) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 2)) }) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index 349bcce8d..c089b4fe0 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -27,7 +27,7 @@ test_that("gblinear works", { expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic', - callbacks = list(cb.gblinear.history())) + callbacks = list(xgb.cb.gblinear.history())) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) @@ 
-44,7 +44,7 @@ test_that("gblinear works", { expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL) bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty', - top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE))) + top_k = 50, callbacks = list(xgb.cb.gblinear.history(sparse = TRUE))) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1))
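
To see the refactored result structure end to end, here is a minimal sketch (not part of the patch itself; it assumes the post-refactor API above, the `agaricus.train` example data that ships with the package, and arbitrary parameter values):

library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
params <- list(objective = "binary:logistic", eval_metric = "error",
               max_depth = 2, nthread = 1)

# Results produced by callbacks are now stored under each callback's
# 'cb_name': the early stopping callback stores its output under
# "early_stop", and the CV prediction callback under "cv_predict".
cv <- xgb.cv(
  params, dtrain, nfold = 3, nrounds = 20,
  early_stopping_rounds = 3, maximize = FALSE,
  prediction = TRUE, verbose = 0
)
cv$early_stop$best_iteration  # previously cv$best_iteration
head(cv$cv_predict$pred)      # previously cv$pred

The same naming convention applies to `xgb.train`: a callback's non-NULL result from `f_after_training` lands in the booster's R attributes under its `cb_name`, which is why the tests above read `attributes(bst)$evaluation_log` rather than a dedicated list element.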