[R] Use new predict function. (#6819)

* Call new C prediction API.
* Add `strict_shape`.
* Add `iterationrange`.
* Update documentation.
Jiaming Yuan 2021-06-11 13:03:29 +08:00 committed by GitHub
parent 25514e104a
commit b56614e9b8
18 changed files with 293 additions and 160 deletions

View File

@@ -263,10 +263,7 @@ cb.reset.parameters <- function(new_params) {
 #' \itemize{
 #' \item \code{best_score} the evaluation score at the best iteration
 #' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-#' It differs from \code{best_iteration} in multiclass or random forest settings.
 #' }
-#'
 #' The Same values are also stored as xgb-attributes:
 #' \itemize{
 #' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
@@ -498,13 +495,12 @@ cb.cv.predict <- function(save_models = FALSE) {
       rep(NA_real_, N)
     }
-    ntreelimit <- NVL(env$basket$best_ntreelimit,
-                      env$end_iteration * env$num_parallel_tree)
+    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
     if (NVL(env$params[['booster']], '') == 'gblinear') {
-      ntreelimit <- 0 # must be 0 for gblinear
+      iterationrange <- c(1, 1) # must be 0 for gblinear
     }
     for (fd in env$bst_folds) {
-      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
+      pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
       if (is.matrix(pred)) {
         pred[fd$index, ] <- pr
       } else {
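A side note on the mapping above (illustrative, not part of the diff): the callback now derives a 1-based, half-open `iterationrange` from `best_iteration` instead of multiplying iterations by `num_parallel_tree`. A minimal sketch, assuming a hypothetical booster `bst` with `num_parallel_tree = 1` and a validation DMatrix `dtest`:

    # Deprecated spelling: count the first 7 trees.
    pred_old <- predict(bst, dtest, ntreelimit = 7)
    # New spelling: iterations [1, 8), i.e. rounds 1..7, the same trees.
    pred_new <- predict(bst, dtest, iterationrange = c(1, 8))
    stopifnot(isTRUE(all.equal(pred_old, pred_new)))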

View File

@@ -178,7 +178,8 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
   } else {
     res <- sapply(seq_along(watchlist), function(j) {
       w <- watchlist[[j]]
-      preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees
+      ## predict using all trees
+      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
       eval_res <- feval(preds, w)
       out <- eval_res$value
       names(out) <- paste0(evnames[j], "-", eval_res$metric)
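Worth spelling out from the hunk above: in the new API, `iterationrange = c(1, 1)` is the sentinel for "use all trees", mirroring `iteration_begin = iteration_end = 0` in the C API. A hedged sketch with a hypothetical booster `bst` and DMatrix `dtest`:

    preds_all <- predict(bst, dtest, iterationrange = c(1, 1))  # all trees
    preds_def <- predict(bst, dtest)  # equivalent when no best_iteration is stored
    stopifnot(identical(preds_all, preds_def))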

View File

@@ -168,8 +168,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @param outputmargin whether the prediction should be returned in the form of original untransformed
 #' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 #' logistic regression would result in predictions for log-odds instead of probabilities.
-#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
-#' It will use all the trees by default (\code{NULL} value).
+#' @param ntreelimit Deprecated, use \code{iterationrange} instead.
 #' @param predleaf whether predict leaf index.
 #' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
@@ -179,16 +178,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' or predinteraction flags is TRUE.
 #' @param training whether the prediction result is used for training. For dart booster,
 #' training prediction will perform dropout.
+#' @param iterationrange Specifies which layer of trees is used in prediction. For
+#' example, if a random forest is trained with 100 rounds, specifying
+#' \code{iterationrange = c(1, 21)} means only the forests built during rounds
+#' [1, 21) (half-open interval) are used for this prediction. The index is
+#' 1-based, just like R vectors. When set to \code{c(1, 1)}, XGBoost uses all trees.
+#' @param strict_shape Default is \code{FALSE}. When set to \code{TRUE}, the output
+#' type and shape of the prediction are invariant to the model type.
+#'
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
-#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-#' and it is not necessarily equal to the number of trees in a model.
-#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-#' But for multiclass classification, while there are multiple trees per iteration,
-#' \code{ntreelimit} limits the number of boosting iterations.
 #'
-#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+#' Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
 #' since gblinear doesn't keep its boosting history.
 #'
 #' One possible practical application of the \code{predleaf} option is to use the model
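To make the half-open, 1-based convention above concrete, a hedged sketch (the booster `bst` and matrix `X` are hypothetical):

    # Rounds 1..20, i.e. the half-open interval [1, 21); the Python
    # equivalent is iteration_range=(0, 20).
    p_first20 <- predict(bst, X, iterationrange = c(1, 21))
    # Sentinel for "all rounds":
    p_all <- predict(bst, X, iterationrange = c(1, 1))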
@@ -209,7 +211,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' of the most important features first. See below about the format of the returned results.
 #'
 #' @return
-#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+#' The return type depends on whether \code{strict_shape} is set to \code{TRUE}. By default,
+#' for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 #' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 #' the \code{reshape} value.
@@ -231,6 +234,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 #' such an array.
 #'
+#' When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+#' normal prediction, the output is a two-dimensional array \code{(num_class, nrow(newdata))}.
+#'
+#' For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
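A hedged illustration of the strict_shape dimensions listed above, for a hypothetical 3-class model `bst` trained for 10 rounds on a 150 x 4 matrix `X`:

    p <- predict(bst, X, strict_shape = TRUE)
    dim(p)   # 3 150       -> (num_class, nrow(newdata))
    ct <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
    dim(ct)  # 5 3 150     -> (ncol(newdata) + 1, num_class, nrow(newdata))
    lf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
    dim(lf)  # 1 3 10 150  -> (n_trees_in_forest, num_class, n_iterations, nrow(newdata))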
@@ -253,7 +263,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
 #'
 #' # Predicting tree leafs:
 #' # the result is an nsamples X ntrees matrix
@@ -305,31 +315,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' all.equal(pred, pred_labels)
 #' # prediction from using only 5 iterations should result
 #' # in the same error as seen in iteration 5:
-#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
 #' sum(pred5 != lb)/length(lb)
 #'
-#'
-#' ## random forest-like model of 25 trees for binary classification:
-#'
-#' set.seed(11)
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-#'                nthread = 2, nrounds = 1, objective = "binary:logistic",
-#'                num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-#' # Inspect the prediction error vs number of trees:
-#' lb <- test$label
-#' dtest <- xgb.DMatrix(test$data, label=lb)
-#' err <- sapply(1:25, function(n) {
-#'   pred <- predict(bst, dtest, ntreelimit=n)
-#'   sum((pred > 0.5) != lb)/length(lb)
-#' })
-#' plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-#'
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
-                                reshape = FALSE, training = FALSE, ...) {
+                                reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
     newdata <- xgb.DMatrix(newdata, missing = missing)
@@ -337,81 +330,114 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
       !is.null(colnames(newdata)) &&
       !identical(object[["feature_names"]], colnames(newdata)))
     stop("Feature names stored in `object` and `newdata` are different!")
-  if (is.null(ntreelimit))
-    ntreelimit <- NVL(object$best_ntreelimit, 0)
-  if (NVL(object$params[['booster']], '') == 'gblinear')
-    ntreelimit <- 0
-  if (ntreelimit < 0)
-    stop("ntreelimit cannot be negative")
-
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
-    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
-
-  ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1],
-               as.integer(ntreelimit), as.integer(training))
-
-  n_ret <- length(ret)
-  n_row <- nrow(newdata)
-  npred_per_case <- n_ret / n_row
-
-  if (n_ret %% n_row != 0)
-    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
-
-  if (predleaf) {
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1)
-    } else {
-      matrix(ret, nrow = n_row, byrow = TRUE)
-    }
-  } else if (predcontrib) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_group, n_row),
-          dimnames = list(cnames, NULL, NULL)
-        ),
-        perm = c(2, 3, 1) # [group, row, col]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , ])
-    }
-  } else if (predinteraction) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1^2
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_row),
-          dimnames = list(cnames, cnames, NULL)
-        ),
-        perm = c(3, 1, 2)
-      )
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_group, n_row),
-          dimnames = list(cnames, cnames, NULL, NULL)
-        ),
-        perm = c(3, 4, 1, 2) # [group, row, col1, col2]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , , ])
-    }
-  } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
-  }
-  return(ret)
+
+  if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
+    ntreelimit <- 0
+
+  if (ntreelimit != 0 && is.null(iterationrange)) {
+    ## only ntreelimit, initialize iteration range
+    iterationrange <- c(0, 0)
+  } else if (ntreelimit == 0 && !is.null(iterationrange)) {
+    ## only iteration range, handle 1-based indexing
+    iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
+  } else if (ntreelimit != 0 && !is.null(iterationrange)) {
+    ## both are specified, let libxgboost throw an error
+  } else {
+    ## no limit is supplied, use best
+    if (is.null(object$best_iteration)) {
+      iterationrange <- c(0, 0)
+    } else {
+      ## We don't need to + 1 as R is 1-based index.
+      iterationrange <- c(0, as.integer(object$best_iteration))
+    }
+  }
+  ## Handle the 0 length values.
+  box <- function(val) {
+    if (length(val) == 0) {
+      cval <- vector(, 1)
+      cval[0] <- val
+      return(cval)
+    }
+    return(val)
+  }
+
+  ## We set strict_shape to TRUE then drop the dimensions conditionally
+  args <- list(
+    training = box(training),
+    strict_shape = box(TRUE),
+    iteration_begin = box(as.integer(iterationrange[1])),
+    iteration_end = box(as.integer(iterationrange[2])),
+    ntree_limit = box(as.integer(ntreelimit)),
+    type = box(as.integer(0))
+  )
+
+  set_type <- function(type) {
+    if (args$type != 0) {
+      stop("One type of prediction at a time.")
+    }
+    return(box(as.integer(type)))
+  }
+  if (outputmargin) {
+    args$type <- set_type(1)
+  }
+  if (predcontrib) {
+    args$type <- set_type(if (approxcontrib) 3 else 2)
+  }
+  if (predinteraction) {
+    args$type <- set_type(if (approxcontrib) 5 else 4)
+  }
+  if (predleaf) {
+    args$type <- set_type(6)
+  }
+
+  predts <- .Call(
+    XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
+  )
+  names(predts) <- c("shape", "results")
+  shape <- predts$shape
+  ret <- predts$results
+  n_row <- nrow(newdata)
+  if (n_row != shape[1]) {
+    stop("Incorrect predict shape.")
+  }
+
+  arr <- array(data = ret, dim = rev(shape))
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+
+  if (predcontrib) {
+    dimnames(arr) <- list(cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    }
+  } else if (predinteraction) {
+    dimnames(arr) <- list(cnames, cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    }
+  }
+
+  if (!strict_shape) {
+    n_groups <- shape[2]
+    if (predleaf) {
+      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
+    } else if (predcontrib && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else if (predinteraction && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else if (!reshape && n_groups != 1) {
+      arr <- ret
+    } else if (reshape && n_groups != 1) {
+      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+    }
+    arr <- drop(arr)
+    if (length(dim(arr)) == 1) {
+      arr <- as.vector(arr)
+    } else if (length(dim(arr)) == 2) {
+      arr <- as.matrix(arr)
+    }
+  }
+  return(arr)
 }
 
 #' @rdname predict.xgb.Booster
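For reference, a sketch of the JSON document the rewritten R function above hands to XGBoosterPredictFromDMatrix_R (the values shown are the illustrative defaults for a plain prediction):

    library(jsonlite)
    args <- list(training = FALSE, strict_shape = TRUE,
                 iteration_begin = 0L, iteration_end = 0L,
                 ntree_limit = 0L, type = 0L)
    cat(toJSON(args, auto_unbox = TRUE))
    # {"training":false,"strict_shape":true,"iteration_begin":0,"iteration_end":0,"ntree_limit":0,"type":0}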

View File

@@ -101,9 +101,7 @@
 #' parameter or randomly generated.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
+#' \item \code{best_ntreelimit} and \code{ntreelimit} are deprecated attributes; use \code{best_iteration} instead.
 #' \item \code{pred} CV prediction values available when \code{prediction} is set.
 #' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 #' \item \code{models} a list of the CV folds' models. It is only available with the explicit

View File

@@ -171,9 +171,6 @@
 #' explicitly passed.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
 #' \item \code{best_score} the best evaluation metric value during early stopping.
 #' (only available with early stopping).
 #' \item \code{feature_names} names of the training dataset features

View File

@@ -38,10 +38,7 @@ The following additional fields are assigned to the model's R object:
 \itemize{
 \item \code{best_score} the evaluation score at the best iteration
 \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-\item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-It differs from \code{best_iteration} in multiclass or random forest settings.
 }
-
 The Same values are also stored as xgb-attributes:
 \itemize{
 \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)

View File

@@ -17,6 +17,8 @@
   predinteraction = FALSE,
   reshape = FALSE,
   training = FALSE,
+  iterationrange = NULL,
+  strict_shape = FALSE,
   ...
 )
@@ -34,8 +36,7 @@ missing values in data (e.g., sometimes 0 or some other extreme value is used).}
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would result in predictions for log-odds instead of probabilities.}
 
-\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
-It will use all the trees by default (\code{NULL} value).}
+\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}
 
 \item{predleaf}{whether predict leaf index.}
@@ -52,10 +53,20 @@ or predinteraction flags is TRUE.}
 \item{training}{whether the prediction result is used for training. For dart booster,
 training prediction will perform dropout.}
 
+\item{iterationrange}{Specifies which layer of trees is used in prediction. For
+example, if a random forest is trained with 100 rounds, specifying
+\code{iterationrange = c(1, 21)} means only the forests built during rounds
+[1, 21) (half-open interval) are used for this prediction. The index is
+1-based, just like R vectors. When set to \code{c(1, 1)}, XGBoost uses all trees.}
+
+\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
+type and shape of the prediction are invariant to the model type.}
+
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
 \value{
-For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+The return type depends on whether \code{strict_shape} is set to \code{TRUE}. By default,
+for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 the \code{reshape} value.
@@ -76,18 +87,19 @@ two dimensions. The "+ 1" columns corresponds to bias. Summing this array along
 produce practically the same result as predict with \code{predcontrib = TRUE}.
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such an array.
+
+When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+normal prediction, the output is a two-dimensional array \code{(num_class, nrow(newdata))}.
+
+For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
 }
 \details{
-Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-and it is not necessarily equal to the number of trees in a model.
-E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, while there are multiple trees per iteration,
-\code{ntreelimit} limits the number of boosting iterations.
-
-Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
 since gblinear doesn't keep its boosting history.
 
 One possible practical application of the \code{predleaf} option is to use the model
@@ -120,7 +132,7 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred1 <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
 
 # Predicting tree leafs:
 # the result is an nsamples X ntrees matrix
@@ -172,25 +184,9 @@ str(pred)
 all.equal(pred, pred_labels)
 # prediction from using only 5 iterations should result
 # in the same error as seen in iteration 5:
-pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
 sum(pred5 != lb)/length(lb)
 
-
-## random forest-like model of 25 trees for binary classification:
-
-set.seed(11)
-bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-               nthread = 2, nrounds = 1, objective = "binary:logistic",
-               num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-# Inspect the prediction error vs number of trees:
-lb <- test$label
-dtest <- xgb.DMatrix(test$data, label=lb)
-err <- sapply(1:25, function(n) {
-  pred <- predict(bst, dtest, ntreelimit=n)
-  sum((pred > 0.5) != lb)/length(lb)
-})
-plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-
 }
 \references{
 Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}

View File

@@ -135,9 +135,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
+\item \code{best_ntreelimit} and \code{ntreelimit} are deprecated attributes; use \code{best_iteration} instead.
 \item \code{pred} CV prediction values available when \code{prediction} is set.
 It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 \item \code{models} a list of the CV folds' models. It is only available with the explicit

View File

@@ -187,9 +187,6 @@ An object of class \code{xgb.Booster} with the following elements:
 explicitly passed.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
 \item \code{best_score} the best evaluation metric value during early stopping.
 (only available with early stopping).
 \item \code{feature_names} names of the training dataset features

View File

@@ -30,6 +30,7 @@ extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle);
 extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw);
 extern SEXP XGBoosterModelToRaw_R(SEXP);
 extern SEXP XGBoosterPredict_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
 extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
@@ -63,6 +64,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2},
   {"XGBoosterModelToRaw_R", (DL_FUNC) &XGBoosterModelToRaw_R, 1},
   {"XGBoosterPredict_R", (DL_FUNC) &XGBoosterPredict_R, 5},
+  {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3},
   {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2},
   {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3},
   {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},

View File

@@ -374,6 +374,45 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
   return ret;
 }
 
+SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
+  SEXP r_out_shape;
+  SEXP r_out_result;
+  SEXP r_out;
+  R_API_BEGIN();
+  char const *c_json_config = CHAR(asChar(json_config));
+
+  bst_ulong out_dim;
+  bst_ulong const *out_shape;
+  float const *out_result;
+  CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
+                                         R_ExternalPtrAddr(dmat), c_json_config,
+                                         &out_shape, &out_dim, &out_result));
+
+  r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
+  size_t len = 1;
+  for (size_t i = 0; i < out_dim; ++i) {
+    INTEGER(r_out_shape)[i] = out_shape[i];
+    len *= out_shape[i];
+  }
+  r_out_result = PROTECT(allocVector(REALSXP, len));
+
+#pragma omp parallel for
+  for (omp_ulong i = 0; i < len; ++i) {
+    REAL(r_out_result)[i] = out_result[i];
+  }
+
+  r_out = PROTECT(allocVector(VECSXP, 2));
+  SET_VECTOR_ELT(r_out, 0, r_out_shape);
+  SET_VECTOR_ELT(r_out, 1, r_out_result);
+
+  R_API_END();
+  UNPROTECT(3);
+  return r_out;
+}
+
 SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));

View File

@@ -164,7 +164,7 @@ XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP h
 XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
 /*!
- * \brief make prediction based on dmat
+ * \brief (Deprecated) make prediction based on dmat
  * \param handle handle
  * \param dmat data matrix
  * \param option_mask output_margin:1 predict_leaf:2
@@ -173,6 +173,16 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
  */
 XGB_DLL SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
                                 SEXP ntree_limit, SEXP training);
+/*!
+ * \brief Run prediction on DMatrix, replacing `XGBoosterPredict_R`.
+ * \param handle handle
+ * \param dmat data matrix
+ * \param json_config See `XGBoosterPredictFromDMatrix` in xgboost c_api.h
+ *
+ * \return A list of 2 vectors: the first holds the shape, the second the prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
 /*!
  * \brief load model from existing file
  * \param handle handle

View File

@@ -34,6 +34,10 @@ test_that("train and predict binary classification", {
   err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
   err_log <- bst$evaluation_log[1, train_error]
   expect_lt(abs(err_pred1 - err_log), 10e-6)
+
+  pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_length(pred1, 6513)
+  expect_equal(pred1, pred2)
 })
 
 test_that("parameter validation works", {
@@ -143,6 +147,9 @@ test_that("train and predict softprob", {
   pred_labels <- max.col(mpred) - 1
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
+
+  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
+  expect_equal(mpred, mpred1)
 })
 
 test_that("train and predict softmax", {
@@ -182,10 +189,8 @@ test_that("train and predict RF", {
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
   expect_equal(pred_err_20, pred_err)
 
-  #pred <- predict(bst, train$data, ntreelimit = 1)
-  #pred_err_1 <- sum((pred > 0.5) != lb)/length(lb)
-  #expect_lt(pred_err, pred_err_1)
-  #expect_lt(pred_err, 0.08)
+  pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_equal(pred, pred1)
 })
 
 test_that("train and predict RF with softprob", {
@@ -385,3 +390,57 @@ test_that("Configuration works", {
   reloaded_config <- xgb.config(bst)
   expect_equal(config, reloaded_config);
 })
+
+test_that("strict_shape works", {
+  n_rounds <- 2
+
+  test_strict_shape <- function(bst, X, n_groups) {
+    predt <- predict(bst, X, strict_shape = TRUE)
+    margin <- predict(bst, X, outputmargin = TRUE, strict_shape = TRUE)
+    contri <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
+    interact <- predict(bst, X, predinteraction = TRUE, strict_shape = TRUE)
+    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
+
+    n_rows <- nrow(X)
+    n_cols <- ncol(X)
+
+    expect_equal(dim(predt), c(n_groups, n_rows))
+    expect_equal(dim(margin), c(n_groups, n_rows))
+    expect_equal(dim(contri), c(n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(interact), c(n_cols + 1, n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(leaf), c(1, n_groups, n_rounds, n_rows))
+
+    if (n_groups != 1) {
+      for (g in seq_len(n_groups)) {
+        expect_lt(max(abs(colSums(contri[, g, ]) - margin[g, ])), 1e-5)
+      }
+    }
+  }
+
+  test_iris <- function() {
+    y <- as.numeric(iris$Species) - 1
+    X <- as.matrix(iris[, -5])
+
+    bst <- xgboost(data = X, label = y,
+                   max_depth = 2, nrounds = n_rounds,
+                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
+
+    test_strict_shape(bst, X, 3)
+  }
+
+  test_agaricus <- function() {
+    data(agaricus.train, package = 'xgboost')
+
+    X <- agaricus.train$data
+    y <- agaricus.train$label
+
+    bst <- xgboost(data = X, label = y, max_depth = 2,
+                   nrounds = n_rounds, objective = "binary:logistic",
+                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+
+    test_strict_shape(bst, X, 1)
+  }
+
+  test_iris()
+  test_agaricus()
+})

View File

@@ -6,7 +6,7 @@ Prediction
 There are a number of prediction functions in XGBoost with various parameters. This
 document attempts to clarify some of the confusion around prediction with a focus on the
-Python binding.
+Python binding; the R package behaves similarly when ``strict_shape`` is specified (see below).
 
 ******************
 Prediction Options
@@ -58,6 +58,13 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 ``apply`` method in scikit learn interface, this is set to False by default.
 
+For the R package, when ``strict_shape`` is specified, an ``array`` is returned with the same
+values as in Python, except that the R array is column-major while the Python numpy array is
+row-major, so all the dimensions are reversed. For example, a Python ``predict_leaf`` output
+obtained with ``strict_shape=True`` has 4 dimensions: ``(n_samples, n_iterations,
+n_classes, n_trees_in_forest)``, while R with ``strict_shape=TRUE`` outputs
+``(n_trees_in_forest, n_classes, n_iterations, n_samples)``.
+
 Other than these prediction types, there's also a parameter called ``iteration_range``,
 which is similar to model slicing. But instead of actually splitting up the model into
 multiple stacks, it simply returns the prediction formed by the trees within range.
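A hedged R-side sketch tying together the two notes above, the reversed array layout and tree slicing (the booster ``bst`` and matrix ``X`` are hypothetical; assume a 3-class model trained for 10 rounds of single trees):

    # strict_shape: R returns the Python shape reversed (column-major vs row-major)
    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
    dim(leaf)  # 1 3 10 nrow(X): (n_trees_in_forest, n_classes, n_iterations, n_samples)

    # iterationrange: use only the first 20 boosting rounds, without slicing the model
    p <- predict(bst, X, iterationrange = c(1, 21))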

View File

@@ -111,9 +111,8 @@ def _convert_ntree_limit(
         raise ValueError(
             "Only one of `iteration_range` and `ntree_limit` can be non zero."
         )
-    num_parallel_tree, num_groups = _get_booster_layer_trees(booster)
+    num_parallel_tree, _ = _get_booster_layer_trees(booster)
     num_parallel_tree = max([num_parallel_tree, 1])
-    num_groups = max([num_groups, 1])
     iteration_range = (0, ntree_limit // num_parallel_tree)
     return iteration_range

View File

@@ -662,9 +662,21 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
   auto *learner = static_cast<Learner*>(handle);
   auto& entry = learner->GetThreadLocal().prediction_entry;
   auto p_m = *static_cast<std::shared_ptr<DMatrix> *>(dmat);
-  auto type = PredictionType(get<Integer const>(config["type"]));
-  auto iteration_begin = get<Integer const>(config["iteration_begin"]);
-  auto iteration_end = get<Integer const>(config["iteration_end"]);
+
+  auto const& j_config = get<Object const>(config);
+  auto type = PredictionType(get<Integer const>(j_config.at("type")));
+  auto iteration_begin = get<Integer const>(j_config.at("iteration_begin"));
+  auto iteration_end = get<Integer const>(j_config.at("iteration_end"));
+
+  auto ntree_limit_it = j_config.find("ntree_limit");
+  if (ntree_limit_it != j_config.cend() && !IsA<Null>(ntree_limit_it->second) &&
+      get<Integer const>(ntree_limit_it->second) != 0) {
+    CHECK(iteration_end == 0) <<
+        "Only one of the `ntree_limit` or `iteration_range` can be specified.";
+    LOG(WARNING) << "`ntree_limit` is deprecated, use `iteration_range` instead.";
+    iteration_end = GetIterationFromTreeLimit(get<Integer const>(ntree_limit_it->second), learner);
+  }
 
   bool approximate = type == PredictionType::kApproxContribution ||
                      type == PredictionType::kApproxInteraction;
   bool contribs = type == PredictionType::kContribution ||

View File

@@ -48,7 +48,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
       *out_dim = 2;
       shape.resize(*out_dim);
       shape.front() = rows;
-      shape.back() = groups;
+      shape.back() = std::min(groups, chunksize);
     }
     break;
   }

View File

@@ -587,7 +587,6 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const DMatrix& fmat,
 #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
   std::bernoulli_distribution coin_flip(param_.subsample);
-  size_t used = 0, unused = 0;
   for (size_t i = 0; i < info.num_row_; ++i) {
     if (!(gpair_ref[i].GetHess() >= 0.0f && coin_flip(rnd)) || gpair_ref[i].GetGrad() == 0.0f) {
       gpair_ref[i] = GradientPair(0);