[R] Use new predict function. (#6819)

* Call new C prediction API.
* Add `strict_shape`.
* Add `iterationrange`.
* Update documentation.
Jiaming Yuan 2021-06-11 13:03:29 +08:00 committed by GitHub
parent 25514e104a
commit b56614e9b8
18 changed files with 293 additions and 160 deletions

View File

@@ -263,10 +263,7 @@ cb.reset.parameters <- function(new_params) {
 #' \itemize{
 #' \item \code{best_score} the evaluation score at the best iteration
 #' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-#' It differs from \code{best_iteration} in multiclass or random forest settings.
 #' }
-#'
 #' The Same values are also stored as xgb-attributes:
 #' \itemize{
 #' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
@@ -498,13 +495,12 @@ cb.cv.predict <- function(save_models = FALSE) {
       rep(NA_real_, N)
     }
-    ntreelimit <- NVL(env$basket$best_ntreelimit,
-                      env$end_iteration * env$num_parallel_tree)
+    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
     if (NVL(env$params[['booster']], '') == 'gblinear') {
-      ntreelimit <- 0 # must be 0 for gblinear
+      iterationrange <- c(1, 1) # must be 0 for gblinear
     }
     for (fd in env$bst_folds) {
-      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
+      pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
       if (is.matrix(pred)) {
         pred[fd$index, ] <- pr
       } else {
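A side note on the mapping above (illustrative, not part of the diff): the callback now derives a 1-based, half-open `iterationrange` from `best_iteration` instead of multiplying iterations by `num_parallel_tree`. A minimal sketch, assuming a hypothetical booster `bst` with `num_parallel_tree = 1` and a validation DMatrix `dtest`:

    # Deprecated spelling: count the first 7 trees.
    pred_old <- predict(bst, dtest, ntreelimit = 7)
    # New spelling: iterations [1, 8), i.e. rounds 1..7, the same trees.
    pred_new <- predict(bst, dtest, iterationrange = c(1, 8))
    stopifnot(isTRUE(all.equal(pred_old, pred_new)))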

View File

@@ -178,7 +178,8 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
   } else {
     res <- sapply(seq_along(watchlist), function(j) {
       w <- watchlist[[j]]
-      preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees
+      ## predict using all trees
+      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
       eval_res <- feval(preds, w)
       out <- eval_res$value
       names(out) <- paste0(evnames[j], "-", eval_res$metric)
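Worth spelling out from the hunk above: in the new API, `iterationrange = c(1, 1)` is the sentinel for "use all trees", mirroring `iteration_begin = iteration_end = 0` in the C API. A hedged sketch with a hypothetical booster `bst` and DMatrix `dtest`:

    preds_all <- predict(bst, dtest, iterationrange = c(1, 1))  # all trees
    preds_def <- predict(bst, dtest)  # equivalent when no best_iteration is stored
    stopifnot(identical(preds_all, preds_def))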

View File

@@ -168,8 +168,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @param outputmargin whether the prediction should be returned in the form of original untransformed
 #' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 #' logistic regression would result in predictions for log-odds instead of probabilities.
-#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
-#' It will use all the trees by default (\code{NULL} value).
+#' @param ntreelimit Deprecated, use \code{iterationrange} instead.
 #' @param predleaf whether predict leaf index.
 #' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
@@ -179,16 +178,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' or predinteraction flags is TRUE.
 #' @param training whether the prediction result is used for training. For dart booster,
 #' training prediction will perform dropout.
+#' @param iterationrange Specifies which layer of trees is used in prediction. For
+#' example, if a random forest is trained with 100 rounds, specifying
+#' \code{iterationrange = c(1, 21)} means only the forests built during rounds
+#' [1, 21) (half-open interval) are used for this prediction. The index is
+#' 1-based, just like R vectors. When set to \code{c(1, 1)}, XGBoost uses all trees.
+#' @param strict_shape Default is \code{FALSE}. When set to \code{TRUE}, the output
+#' type and shape of the prediction are invariant to the model type.
+#'
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
-#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-#' and it is not necessarily equal to the number of trees in a model.
-#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-#' But for multiclass classification, while there are multiple trees per iteration,
-#' \code{ntreelimit} limits the number of boosting iterations.
 #'
-#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+#' Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
 #' since gblinear doesn't keep its boosting history.
 #'
 #' One possible practical application of the \code{predleaf} option is to use the model
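To make the half-open, 1-based convention above concrete, a hedged sketch (the booster `bst` and matrix `X` are hypothetical):

    # Rounds 1..20, i.e. the half-open interval [1, 21); the Python
    # equivalent is iteration_range=(0, 20).
    p_first20 <- predict(bst, X, iterationrange = c(1, 21))
    # Sentinel for "all rounds":
    p_all <- predict(bst, X, iterationrange = c(1, 1))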
@@ -209,7 +211,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' of the most important features first. See below about the format of the returned results.
 #'
 #' @return
-#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+#' The return type depends on whether \code{strict_shape} is set to \code{TRUE}. By default,
+#' for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 #' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 #' the \code{reshape} value.
@@ -231,6 +234,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 #' such an array.
 #'
+#' When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+#' normal prediction, the output is a two-dimensional array \code{(num_class, nrow(newdata))}.
+#'
+#' For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
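A hedged illustration of the strict_shape dimensions listed above, for a hypothetical 3-class model `bst` trained for 10 rounds on a 150 x 4 matrix `X`:

    p <- predict(bst, X, strict_shape = TRUE)
    dim(p)   # 3 150       -> (num_class, nrow(newdata))
    ct <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
    dim(ct)  # 5 3 150     -> (ncol(newdata) + 1, num_class, nrow(newdata))
    lf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
    dim(lf)  # 1 3 10 150  -> (n_trees_in_forest, num_class, n_iterations, nrow(newdata))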
@@ -253,7 +263,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
 #'
 #' # Predicting tree leafs:
 #' # the result is an nsamples X ntrees matrix
@@ -305,31 +315,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' all.equal(pred, pred_labels)
 #' # prediction from using only 5 iterations should result
 #' # in the same error as seen in iteration 5:
-#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
 #' sum(pred5 != lb)/length(lb)
 #'
-#'
-#' ## random forest-like model of 25 trees for binary classification:
-#'
-#' set.seed(11)
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-#'                nthread = 2, nrounds = 1, objective = "binary:logistic",
-#'                num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-#' # Inspect the prediction error vs number of trees:
-#' lb <- test$label
-#' dtest <- xgb.DMatrix(test$data, label=lb)
-#' err <- sapply(1:25, function(n) {
-#'   pred <- predict(bst, dtest, ntreelimit=n)
-#'   sum((pred > 0.5) != lb)/length(lb)
-#' })
-#' plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-#'
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
-                                reshape = FALSE, training = FALSE, ...) {
+                                reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
     newdata <- xgb.DMatrix(newdata, missing = missing)
@@ -337,81 +330,114 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
       !is.null(colnames(newdata)) &&
       !identical(object[["feature_names"]], colnames(newdata)))
     stop("Feature names stored in `object` and `newdata` are different!")
-  if (is.null(ntreelimit))
-    ntreelimit <- NVL(object$best_ntreelimit, 0)
-  if (NVL(object$params[['booster']], '') == 'gblinear')
-    ntreelimit <- 0
-  if (ntreelimit < 0)
-    stop("ntreelimit cannot be negative")
-
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
-    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
-
-  ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1],
-               as.integer(ntreelimit), as.integer(training))
-
-  n_ret <- length(ret)
-  n_row <- nrow(newdata)
-  npred_per_case <- n_ret / n_row
-
-  if (n_ret %% n_row != 0)
-    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
-
-  if (predleaf) {
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1)
-    } else {
-      matrix(ret, nrow = n_row, byrow = TRUE)
-    }
-  } else if (predcontrib) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_group, n_row),
-          dimnames = list(cnames, NULL, NULL)
-        ),
-        perm = c(2, 3, 1) # [group, row, col]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , ])
-    }
-  } else if (predinteraction) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1^2
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_row),
-          dimnames = list(cnames, cnames, NULL)
-        ),
-        perm = c(3, 1, 2)
-      )
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_group, n_row),
-          dimnames = list(cnames, cnames, NULL, NULL)
-        ),
-        perm = c(3, 4, 1, 2) # [group, row, col1, col2]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , , ])
-    }
-  } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
-  }
-  return(ret)
+
+  if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
+    ntreelimit <- 0
+
+  if (ntreelimit != 0 && is.null(iterationrange)) {
+    ## only ntreelimit, initialize iteration range
+    iterationrange <- c(0, 0)
+  } else if (ntreelimit == 0 && !is.null(iterationrange)) {
+    ## only iteration range, handle 1-based indexing
+    iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
+  } else if (ntreelimit != 0 && !is.null(iterationrange)) {
+    ## both are specified, let libxgboost throw an error
+  } else {
+    ## no limit is supplied, use best
+    if (is.null(object$best_iteration)) {
+      iterationrange <- c(0, 0)
+    } else {
+      ## We don't need to + 1 as R is 1-based index.
+      iterationrange <- c(0, as.integer(object$best_iteration))
+    }
+  }
+  ## Handle the 0 length values.
+  box <- function(val) {
+    if (length(val) == 0) {
+      cval <- vector(, 1)
+      cval[0] <- val
+      return(cval)
+    }
+    return(val)
+  }
+
+  ## We set strict_shape to TRUE then drop the dimensions conditionally
+  args <- list(
+    training = box(training),
+    strict_shape = box(TRUE),
+    iteration_begin = box(as.integer(iterationrange[1])),
+    iteration_end = box(as.integer(iterationrange[2])),
+    ntree_limit = box(as.integer(ntreelimit)),
+    type = box(as.integer(0))
+  )
+
+  set_type <- function(type) {
+    if (args$type != 0) {
+      stop("One type of prediction at a time.")
+    }
+    return(box(as.integer(type)))
+  }
+  if (outputmargin) {
+    args$type <- set_type(1)
+  }
+  if (predcontrib) {
+    args$type <- set_type(if (approxcontrib) 3 else 2)
+  }
+  if (predinteraction) {
+    args$type <- set_type(if (approxcontrib) 5 else 4)
+  }
+  if (predleaf) {
+    args$type <- set_type(6)
+  }
+
+  predts <- .Call(
+    XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
+  )
+  names(predts) <- c("shape", "results")
+  shape <- predts$shape
+  ret <- predts$results
+  n_row <- nrow(newdata)
+  if (n_row != shape[1]) {
+    stop("Incorrect predict shape.")
+  }
+
+  arr <- array(data = ret, dim = rev(shape))
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+
+  if (predcontrib) {
+    dimnames(arr) <- list(cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    }
+  } else if (predinteraction) {
+    dimnames(arr) <- list(cnames, cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    }
+  }
+
+  if (!strict_shape) {
+    n_groups <- shape[2]
+    if (predleaf) {
+      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
+    } else if (predcontrib && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else if (predinteraction && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else if (!reshape && n_groups != 1) {
+      arr <- ret
+    } else if (reshape && n_groups != 1) {
+      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+    }
+    arr <- drop(arr)
+    if (length(dim(arr)) == 1) {
+      arr <- as.vector(arr)
+    } else if (length(dim(arr)) == 2) {
+      arr <- as.matrix(arr)
+    }
+  }
+  return(arr)
 }
 
 #' @rdname predict.xgb.Booster
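For reference, a sketch of the JSON document the rewritten R function above hands to XGBoosterPredictFromDMatrix_R (the values shown are the illustrative defaults for a plain prediction):

    library(jsonlite)
    args <- list(training = FALSE, strict_shape = TRUE,
                 iteration_begin = 0L, iteration_end = 0L,
                 ntree_limit = 0L, type = 0L)
    cat(toJSON(args, auto_unbox = TRUE))
    # {"training":false,"strict_shape":true,"iteration_begin":0,"iteration_end":0,"ntree_limit":0,"type":0}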

View File

@@ -101,9 +101,7 @@
 #' parameter or randomly generated.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
+#' \item \code{best_ntreelimit} and \code{ntreelimit} are deprecated attributes; use \code{best_iteration} instead.
 #' \item \code{pred} CV prediction values available when \code{prediction} is set.
 #' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 #' \item \code{models} a list of the CV folds' models. It is only available with the explicit

View File

@@ -171,9 +171,6 @@
 #' explicitly passed.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
 #' \item \code{best_score} the best evaluation metric value during early stopping.
 #' (only available with early stopping).
 #' \item \code{feature_names} names of the training dataset features

View File

@@ -38,10 +38,7 @@ The following additional fields are assigned to the model's R object:
 \itemize{
 \item \code{best_score} the evaluation score at the best iteration
 \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-\item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-It differs from \code{best_iteration} in multiclass or random forest settings.
 }
-
 The Same values are also stored as xgb-attributes:
 \itemize{
 \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)

View File

@@ -17,6 +17,8 @@
   predinteraction = FALSE,
   reshape = FALSE,
   training = FALSE,
+  iterationrange = NULL,
+  strict_shape = FALSE,
   ...
 )
@@ -34,8 +36,7 @@ missing values in data (e.g., sometimes 0 or some other extreme value is used).}
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would result in predictions for log-odds instead of probabilities.}
 
-\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
-It will use all the trees by default (\code{NULL} value).}
+\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}
 
 \item{predleaf}{whether predict leaf index.}
@@ -52,10 +53,20 @@ or predinteraction flags is TRUE.}
 \item{training}{whether the prediction result is used for training. For dart booster,
 training prediction will perform dropout.}
 
+\item{iterationrange}{Specifies which layer of trees is used in prediction. For
+example, if a random forest is trained with 100 rounds, specifying
+\code{iterationrange = c(1, 21)} means only the forests built during rounds
+[1, 21) (half-open interval) are used for this prediction. The index is
+1-based, just like R vectors. When set to \code{c(1, 1)}, XGBoost uses all trees.}
+
+\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
+type and shape of the prediction are invariant to the model type.}
+
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
 \value{
-For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+The return type depends on whether \code{strict_shape} is set to \code{TRUE}. By default,
+for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 the \code{reshape} value.
@@ -76,18 +87,19 @@ two dimensions. The "+ 1" columns corresponds to bias. Summing this array along
 produce practically the same result as predict with \code{predcontrib = TRUE}.
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such an array.
+
+When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+normal prediction, the output is a two-dimensional array \code{(num_class, nrow(newdata))}.
+
+For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
 }
 \details{
-Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-and it is not necessarily equal to the number of trees in a model.
-E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, while there are multiple trees per iteration,
-\code{ntreelimit} limits the number of boosting iterations.
-
-Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
 since gblinear doesn't keep its boosting history.
 
 One possible practical application of the \code{predleaf} option is to use the model
@@ -120,7 +132,7 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred1 <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
 
 # Predicting tree leafs:
 # the result is an nsamples X ntrees matrix
@@ -172,25 +184,9 @@ str(pred)
 all.equal(pred, pred_labels)
 # prediction from using only 5 iterations should result
 # in the same error as seen in iteration 5:
-pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
 sum(pred5 != lb)/length(lb)
 
-
-## random forest-like model of 25 trees for binary classification:
-
-set.seed(11)
-bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-               nthread = 2, nrounds = 1, objective = "binary:logistic",
-               num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-# Inspect the prediction error vs number of trees:
-lb <- test$label
-dtest <- xgb.DMatrix(test$data, label=lb)
-err <- sapply(1:25, function(n) {
-  pred <- predict(bst, dtest, ntreelimit=n)
-  sum((pred > 0.5) != lb)/length(lb)
-})
-plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-
 }
 \references{
 Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}

View File

@@ -135,9 +135,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
+\item \code{best_ntreelimit} and \code{ntreelimit} are deprecated attributes; use \code{best_iteration} instead.
 \item \code{pred} CV prediction values available when \code{prediction} is set.
 It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 \item \code{models} a list of the CV folds' models. It is only available with the explicit

View File

@@ -187,9 +187,6 @@ An object of class \code{xgb.Booster} with the following elements:
 explicitly passed.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
 \item \code{best_score} the best evaluation metric value during early stopping.
 (only available with early stopping).
 \item \code{feature_names} names of the training dataset features

View File

@@ -30,6 +30,7 @@ extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle);
 extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw);
 extern SEXP XGBoosterModelToRaw_R(SEXP);
 extern SEXP XGBoosterPredict_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSaveModel_R(SEXP, SEXP);
 extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
@@ -63,6 +64,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2},
   {"XGBoosterModelToRaw_R", (DL_FUNC) &XGBoosterModelToRaw_R, 1},
   {"XGBoosterPredict_R", (DL_FUNC) &XGBoosterPredict_R, 5},
+  {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3},
   {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2},
   {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3},
   {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},

View File

@@ -374,6 +374,45 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
   return ret;
 }
 
+SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
+  SEXP r_out_shape;
+  SEXP r_out_result;
+  SEXP r_out;
+  R_API_BEGIN();
+  char const *c_json_config = CHAR(asChar(json_config));
+
+  bst_ulong out_dim;
+  bst_ulong const *out_shape;
+  float const *out_result;
+  CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle),
+                                         R_ExternalPtrAddr(dmat), c_json_config,
+                                         &out_shape, &out_dim, &out_result));
+
+  r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
+  size_t len = 1;
+  for (size_t i = 0; i < out_dim; ++i) {
+    INTEGER(r_out_shape)[i] = out_shape[i];
+    len *= out_shape[i];
+  }
+  r_out_result = PROTECT(allocVector(REALSXP, len));
+
+#pragma omp parallel for
+  for (omp_ulong i = 0; i < len; ++i) {
+    REAL(r_out_result)[i] = out_result[i];
+  }
+
+  r_out = PROTECT(allocVector(VECSXP, 2));
+  SET_VECTOR_ELT(r_out, 0, r_out_shape);
+  SET_VECTOR_ELT(r_out, 1, r_out_result);
+
+  R_API_END();
+  UNPROTECT(3);
+  return r_out;
+}
+
 SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));

View File

@@ -164,7 +164,7 @@ XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP h
 XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
 /*!
- * \brief make prediction based on dmat
+ * \brief (Deprecated) make prediction based on dmat
  * \param handle handle
  * \param dmat data matrix
  * \param option_mask output_margin:1 predict_leaf:2
@@ -173,6 +173,16 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
  */
 XGB_DLL SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
                                 SEXP ntree_limit, SEXP training);
+/*!
+ * \brief Run prediction on DMatrix, replacing `XGBoosterPredict_R`.
+ * \param handle handle
+ * \param dmat data matrix
+ * \param json_config See `XGBoosterPredictFromDMatrix` in xgboost c_api.h
+ *
+ * \return A list of 2 vectors: the first holds the shape, the second the prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
 /*!
  * \brief load model from existing file
  * \param handle handle

View File

@@ -34,6 +34,10 @@ test_that("train and predict binary classification", {
   err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
   err_log <- bst$evaluation_log[1, train_error]
   expect_lt(abs(err_pred1 - err_log), 10e-6)
+
+  pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_length(pred1, 6513)
+  expect_equal(pred1, pred2)
 })
 
 test_that("parameter validation works", {
@@ -143,6 +147,9 @@ test_that("train and predict softprob", {
   pred_labels <- max.col(mpred) - 1
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
+
+  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
+  expect_equal(mpred, mpred1)
 })
 
 test_that("train and predict softmax", {
@@ -182,10 +189,8 @@ test_that("train and predict RF", {
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
   expect_equal(pred_err_20, pred_err)
 
-  #pred <- predict(bst, train$data, ntreelimit = 1)
-  #pred_err_1 <- sum((pred > 0.5) != lb)/length(lb)
-  #expect_lt(pred_err, pred_err_1)
-  #expect_lt(pred_err, 0.08)
+  pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_equal(pred, pred1)
 })
 
 test_that("train and predict RF with softprob", {
@@ -385,3 +390,57 @@ test_that("Configuration works", {
   reloaded_config <- xgb.config(bst)
   expect_equal(config, reloaded_config);
 })
+
+test_that("strict_shape works", {
+  n_rounds <- 2
+
+  test_strict_shape <- function(bst, X, n_groups) {
+    predt <- predict(bst, X, strict_shape = TRUE)
+    margin <- predict(bst, X, outputmargin = TRUE, strict_shape = TRUE)
+    contri <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
+    interact <- predict(bst, X, predinteraction = TRUE, strict_shape = TRUE)
+    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
+
+    n_rows <- nrow(X)
+    n_cols <- ncol(X)
+
+    expect_equal(dim(predt), c(n_groups, n_rows))
+    expect_equal(dim(margin), c(n_groups, n_rows))
+    expect_equal(dim(contri), c(n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(interact), c(n_cols + 1, n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(leaf), c(1, n_groups, n_rounds, n_rows))
+
+    if (n_groups != 1) {
+      for (g in seq_len(n_groups)) {
+        expect_lt(max(abs(colSums(contri[, g, ]) - margin[g, ])), 1e-5)
+      }
+    }
+  }
+
+  test_iris <- function() {
+    y <- as.numeric(iris$Species) - 1
+    X <- as.matrix(iris[, -5])
+
+    bst <- xgboost(data = X, label = y,
+                   max_depth = 2, nrounds = n_rounds,
+                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
+
+    test_strict_shape(bst, X, 3)
+  }
+
+  test_agaricus <- function() {
+    data(agaricus.train, package = 'xgboost')
+
+    X <- agaricus.train$data
+    y <- agaricus.train$label
+
+    bst <- xgboost(data = X, label = y, max_depth = 2,
+                   nrounds = n_rounds, objective = "binary:logistic",
+                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+
+    test_strict_shape(bst, X, 1)
+  }
+
+  test_iris()
+  test_agaricus()
+})

View File

@@ -6,7 +6,7 @@ Prediction
 There are a number of prediction functions in XGBoost with various parameters. This
 document attempts to clarify some of the confusion around prediction with a focus on the
-Python binding.
+Python binding; the R package behaves similarly when ``strict_shape`` is specified (see below).
 
 ******************
 Prediction Options
@@ -58,6 +58,13 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 ``apply`` method in scikit learn interface, this is set to False by default.
 
+For the R package, when ``strict_shape`` is specified, an ``array`` is returned with the same
+values as in Python, except that the R array is column-major while the Python numpy array is
+row-major, so all the dimensions are reversed. For example, a Python ``predict_leaf`` output
+obtained with ``strict_shape=True`` has 4 dimensions: ``(n_samples, n_iterations,
+n_classes, n_trees_in_forest)``, while R with ``strict_shape=TRUE`` outputs
+``(n_trees_in_forest, n_classes, n_iterations, n_samples)``.
+
 Other than these prediction types, there's also a parameter called ``iteration_range``,
 which is similar to model slicing. But instead of actually splitting up the model into
 multiple stacks, it simply returns the prediction formed by the trees within range.
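A hedged R-side sketch tying together the two notes above, the reversed array layout and tree slicing (the booster ``bst`` and matrix ``X`` are hypothetical; assume a 3-class model trained for 10 rounds of single trees):

    # strict_shape: R returns the Python shape reversed (column-major vs row-major)
    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
    dim(leaf)  # 1 3 10 nrow(X): (n_trees_in_forest, n_classes, n_iterations, n_samples)

    # iterationrange: use only the first 20 boosting rounds, without slicing the model
    p <- predict(bst, X, iterationrange = c(1, 21))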

View File

@@ -111,9 +111,8 @@ def _convert_ntree_limit(
         raise ValueError(
             "Only one of `iteration_range` and `ntree_limit` can be non zero."
         )
-    num_parallel_tree, num_groups = _get_booster_layer_trees(booster)
+    num_parallel_tree, _ = _get_booster_layer_trees(booster)
     num_parallel_tree = max([num_parallel_tree, 1])
-    num_groups = max([num_groups, 1])
     iteration_range = (0, ntree_limit // num_parallel_tree)
     return iteration_range

View File

@@ -662,9 +662,21 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
   auto *learner = static_cast<Learner*>(handle);
   auto& entry = learner->GetThreadLocal().prediction_entry;
   auto p_m = *static_cast<std::shared_ptr<DMatrix> *>(dmat);
-  auto type = PredictionType(get<Integer const>(config["type"]));
-  auto iteration_begin = get<Integer const>(config["iteration_begin"]);
-  auto iteration_end = get<Integer const>(config["iteration_end"]);
+
+  auto const& j_config = get<Object const>(config);
+  auto type = PredictionType(get<Integer const>(j_config.at("type")));
+  auto iteration_begin = get<Integer const>(j_config.at("iteration_begin"));
+  auto iteration_end = get<Integer const>(j_config.at("iteration_end"));
+
+  auto ntree_limit_it = j_config.find("ntree_limit");
+  if (ntree_limit_it != j_config.cend() && !IsA<Null>(ntree_limit_it->second) &&
+      get<Integer const>(ntree_limit_it->second) != 0) {
+    CHECK(iteration_end == 0) <<
+        "Only one of the `ntree_limit` or `iteration_range` can be specified.";
+    LOG(WARNING) << "`ntree_limit` is deprecated, use `iteration_range` instead.";
+    iteration_end = GetIterationFromTreeLimit(get<Integer const>(ntree_limit_it->second), learner);
+  }
 
   bool approximate = type == PredictionType::kApproxContribution ||
                      type == PredictionType::kApproxInteraction;
   bool contribs = type == PredictionType::kContribution ||

View File

@@ -48,7 +48,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
       *out_dim = 2;
       shape.resize(*out_dim);
       shape.front() = rows;
-      shape.back() = groups;
+      shape.back() = std::min(groups, chunksize);
     }
     break;
   }

View File

@@ -587,7 +587,6 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const DMatrix& fmat,
 #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
   std::bernoulli_distribution coin_flip(param_.subsample);
-  size_t used = 0, unused = 0;
   for (size_t i = 0; i < info.num_row_; ++i) {
     if (!(gpair_ref[i].GetHess() >= 0.0f && coin_flip(rnd)) || gpair_ref[i].GetGrad() == 0.0f) {
       gpair_ref[i] = GradientPair(0);