From b56614e9b86c0eaaf09b9ee9b2d00c4185c3496f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 11 Jun 2021 13:03:29 +0800
Subject: [PATCH] [R] Use new predict function. (#6819)

* Call new C prediction API.
* Add `strict_shape`.
* Add `iterationrange`.
* Update document.
---
 R-package/R/callbacks.R               |  10 +-
 R-package/R/utils.R                   |   3 +-
 R-package/R/xgb.Booster.R             | 218 ++++++++++++++------------
 R-package/R/xgb.cv.R                  |   4 +-
 R-package/R/xgb.train.R               |   3 -
 R-package/man/cb.early.stop.Rd        |   3 -
 R-package/man/predict.xgb.Booster.Rd  |  52 +++---
 R-package/man/xgb.cv.Rd               |   4 +-
 R-package/man/xgb.train.Rd            |   3 -
 R-package/src/init.c                  |   2 +
 R-package/src/xgboost_R.cc            |  39 +++++
 R-package/src/xgboost_R.h             |  12 +-
 R-package/tests/testthat/test_basic.R |  67 +++++++-
 doc/prediction.rst                    |   9 +-
 python-package/xgboost/core.py        |   3 +-
 src/c_api/c_api.cc                    |  18 ++-
 src/c_api/c_api_utils.h               |   2 +-
 src/tree/updater_quantile_hist.cc     |   1 -
 18 files changed, 293 insertions(+), 160 deletions(-)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 7c0df2f1d..d5c69f52d 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -263,10 +263,7 @@ cb.reset.parameters <- function(new_params) {
 #' \itemize{
 #' \item \code{best_score} the evaluation score at the best iteration
 #' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-#' It differs from \code{best_iteration} in multiclass or random forest settings.
 #' }
-#'
 #' The same values are also stored as xgb-attributes:
 #' \itemize{
 #' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
@@ -498,13 +495,12 @@ cb.cv.predict <- function(save_models = FALSE) {
       rep(NA_real_, N)
     }

-    ntreelimit <- NVL(env$basket$best_ntreelimit,
-                      env$end_iteration * env$num_parallel_tree)
+    iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
     if (NVL(env$params[['booster']], '') == 'gblinear') {
-      ntreelimit <- 0 # must be 0 for gblinear
+      iterationrange <- c(1, 1) # c(1, 1) means use all iterations; gblinear cannot be sliced
     }
     for (fd in env$bst_folds) {
-      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
+      pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
       if (is.matrix(pred)) {
         pred[fd$index, ] <- pr
       } else {
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 1b0f9eefe..8d9413d8a 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -178,7 +178,8 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
   } else {
     res <- sapply(seq_along(watchlist), function(j) {
       w <- watchlist[[j]]
-      preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees
+      ## predict using all trees
+      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
       eval_res <- feval(preds, w)
       out <- eval_res$value
       names(out) <- paste0(evnames[j], "-", eval_res$metric)
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index af243e302..1228a9a61 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -168,8 +168,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @param outputmargin whether the prediction should be returned in the form of original untransformed
 #' sum of predictions from boosting iterations' results.
 #' E.g., setting \code{outputmargin=TRUE} for
 #' logistic regression would result in predictions for log-odds instead of probabilities.
-#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
-#' It will use all the trees by default (\code{NULL} value).
+#' @param ntreelimit Deprecated, use \code{iterationrange} instead.
 #' @param predleaf whether predict leaf index.
 #' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
@@ -179,16 +178,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' or predinteraction flags is TRUE.
 #' @param training whether the prediction result is used for training. For dart booster,
 #' predicting during training will perform dropout.
+#' @param iterationrange Specifies which layers of trees are used in prediction. For
+#' example, if a random forest is trained with 100 rounds, specifying
+#' \code{iterationrange = c(1, 21)} means only the forests built during the [1, 21)
+#' (half-open) rounds are used in this prediction. The index is 1-based, just like an
+#' R vector. When set to \code{c(1, 1)}, XGBoost will use all trees.
+#' @param strict_shape Default is \code{FALSE}. When set to \code{TRUE}, the output
+#' type and shape of predictions are invariant to the model type.
+#'
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
-#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-#' and it is not necessarily equal to the number of trees in a model.
-#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-#' But for multiclass classification, while there are multiple trees per iteration,
-#' \code{ntreelimit} limits the number of boosting iterations.
 #'
-#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+#' Note that \code{iterationrange} currently does nothing for predictions from gblinear,
 #' since gblinear doesn't keep its boosting history.
 #'
 #' One possible practical application of the \code{predleaf} option is to use the model
@@ -209,7 +211,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' of the most important features first. See below about the format of the returned results.
 #'
 #' @return
-#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+#' The return type differs depending on whether \code{strict_shape} is set to \code{TRUE}. By default,
+#' for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 #' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 #' the \code{reshape} value.
@@ -231,6 +234,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 #' such an array.
 #'
+#' When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+#' normal prediction, the output is a 2-dimensional array \code{(num_class, nrow(newdata))}.
+#'
+#' For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+#' For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
@@ -253,7 +263,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
 #'
 #' # Predicting tree leaves:
 #' # the result is an nsamples X ntrees matrix
@@ -305,31 +315,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' all.equal(pred, pred_labels)
 #' # prediction from using only 5 iterations should result
 #' # in the same error as seen in iteration 5:
-#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
 #' sum(pred5 != lb)/length(lb)
 #'
-#'
-#' ## random forest-like model of 25 trees for binary classification:
-#'
-#' set.seed(11)
-#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-#'                nthread = 2, nrounds = 1, objective = "binary:logistic",
-#'                num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-#' # Inspect the prediction error vs number of trees:
-#' lb <- test$label
-#' dtest <- xgb.DMatrix(test$data, label=lb)
-#' err <- sapply(1:25, function(n) {
-#'   pred <- predict(bst, dtest, ntreelimit=n)
-#'   sum((pred > 0.5) != lb)/length(lb)
-#' })
-#' plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-#'
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
                                 predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
-                                reshape = FALSE, training = FALSE, ...) {
-
+                                reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
     newdata <- xgb.DMatrix(newdata, missing = missing)
@@ -337,81 +330,114 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
       !is.null(colnames(newdata)) &&
       !identical(object[["feature_names"]], colnames(newdata)))
     stop("Feature names stored in `object` and `newdata` are different!")
-  if (is.null(ntreelimit))
-    ntreelimit <- NVL(object$best_ntreelimit, 0)
-  if (NVL(object$params[['booster']], '') == 'gblinear')
+
+  if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
     ntreelimit <- 0
-  if (ntreelimit < 0)
-    stop("ntreelimit cannot be negative")

-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
-    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
-
-  ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1],
-               as.integer(ntreelimit), as.integer(training))
-
-  n_ret <- length(ret)
-  n_row <- nrow(newdata)
-  npred_per_case <- n_ret / n_row
-
-  if (n_ret %% n_row != 0)
-    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
-
-  if (predleaf) {
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1)
+  if (ntreelimit != 0 && is.null(iterationrange)) {
+    ## only ntreelimit is given; initialize the iteration range to "unspecified"
+    iterationrange <- c(0, 0)
+  } else if (ntreelimit == 0 && !is.null(iterationrange)) {
+    ## only the iteration range is given; convert the 1-based indices to 0-based
+    iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
+  } else if (ntreelimit != 0 && !is.null(iterationrange)) {
+    ## both are specified, let libxgboost throw an error
+  } else {
+    ## no limit is supplied, use the best iteration if available
+    if (is.null(object$best_iteration)) {
+      iterationrange <- c(0, 0)
     } else {
-      matrix(ret, nrow = n_row, byrow = TRUE)
+      ## no +1 needed: `best_iteration` is 1-based in R, so it already equals
+      ## the exclusive end of the 0-based half-open range.
+      iterationrange <- c(0, as.integer(object$best_iteration))
     }
-  } else if (predcontrib) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_group, n_row),
-          dimnames = list(cnames, NULL, NULL)
-        ),
-        perm = c(2, 3, 1) # [group, row, col]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , ])
+  }
+  ## Handle the 0 length values.
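+  ## jsonlite::toJSON(auto_unbox = TRUE) turns length-1 vectors into JSON
+  ## scalars but serialises length-0 values as `[]`, which is not a valid
+  ## scalar config field, so box() pads any empty value out to length 1 first.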
+  box <- function(val) {
+    if (length(val) == 0) {
+      cval <- vector(, 1)
+      cval[0] <- val
+      return(cval)
+    }
+    return(val)
+  }
+
+  ## We set strict_shape to TRUE then drop the dimensions conditionally
+  args <- list(
+    training = box(training),
+    strict_shape = box(TRUE),
+    iteration_begin = box(as.integer(iterationrange[1])),
+    iteration_end = box(as.integer(iterationrange[2])),
+    ntree_limit = box(as.integer(ntreelimit)),
+    type = box(as.integer(0))
+  )
+
+  set_type <- function(type) {
+    if (args$type != 0) {
+      stop("One type of prediction at a time.")
+    }
+    return(box(as.integer(type)))
+  }
+  if (outputmargin) {
+    args$type <- set_type(1)
+  }
+  if (predcontrib) {
+    args$type <- set_type(if (approxcontrib) 3 else 2)
+  }
+  if (predinteraction) {
+    args$type <- set_type(if (approxcontrib) 5 else 4)
+  }
+  if (predleaf) {
+    args$type <- set_type(6)
+  }
+
+  predts <- .Call(
+    XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
+  )
+  names(predts) <- c("shape", "results")
+  shape <- predts$shape
+  ret <- predts$results
+
+  n_row <- nrow(newdata)
+  if (n_row != shape[1]) {
+    stop("Incorrect predict shape.")
+  }
+
+  arr <- array(data = ret, dim = rev(shape))
+
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  if (predcontrib) {
+    dimnames(arr) <- list(cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
     }
   } else if (predinteraction) {
-    n_col1 <- ncol(newdata) + 1
-    n_group <- npred_per_case / n_col1^2
-    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
-    ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
-    } else if (n_group == 1) {
-      aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_row),
-          dimnames = list(cnames, cnames, NULL)
-        ),
-        perm = c(3, 1, 2)
-      )
-    } else {
-      arr <- aperm(
-        a = array(
-          data = ret,
-          dim = c(n_col1, n_col1, n_group, n_row),
-          dimnames = list(cnames, cnames, NULL, NULL)
-        ),
-        perm = c(3, 4, 1, 2) # [group, row, col1, col2]
-      )
-      lapply(seq_len(n_group), function(g) arr[g, , , ])
+    dimnames(arr) <- list(cnames, cnames, NULL, NULL)
+    if (!strict_shape) {
+      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
     }
-  } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
   }
-  return(ret)
+
+  if (!strict_shape) {
+    n_groups <- shape[2]
+    if (predleaf) {
+      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
+    } else if (predcontrib && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else if (predinteraction && n_groups != 1) {
+      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else if (!reshape && n_groups != 1) {
+      arr <- ret
+    } else if (reshape && n_groups != 1) {
+      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+    }
+    arr <- drop(arr)
+    if (length(dim(arr)) == 1) {
+      arr <- as.vector(arr)
+    } else if (length(dim(arr)) == 2) {
+      arr <- as.matrix(arr)
+    }
+  }
+  return(arr)
 }

 #' @rdname predict.xgb.Booster
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 8ac0154a0..c07d452c1 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -101,9 +101,7 @@
 #' parameter or randomly generated.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
+#' \item \code{best_ntreelimit} and \code{ntreelimit} are deprecated; use \code{best_iteration} instead.
 #' \item \code{pred} CV prediction values available when \code{prediction} is set.
 #' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 #' \item \code{models} a list of the CV folds' models. It is only available with the explicit
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 981412768..580449791 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -171,9 +171,6 @@
 #' explicitly passed.
 #' \item \code{best_iteration} iteration number with the best evaluation metric value
 #' (only available with early stopping).
-#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-#' which could further be used in \code{predict} method
-#' (only available with early stopping).
 #' \item \code{best_score} the best evaluation metric value during early stopping.
 #' (only available with early stopping).
 #' \item \code{feature_names} names of the training dataset features
diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd
index 1a099d7d3..7b6efa842 100644
--- a/R-package/man/cb.early.stop.Rd
+++ b/R-package/man/cb.early.stop.Rd
@@ -38,10 +38,7 @@ The following additional fields are assigned to the model's R object:
 \itemize{
 \item \code{best_score} the evaluation score at the best iteration
 \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
-\item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-  It differs from \code{best_iteration} in multiclass or random forest settings.
 }
-
 The same values are also stored as xgb-attributes:
 \itemize{
 \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 6430eabf5..34d1f86e5 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -17,6 +17,8 @@
   predinteraction = FALSE,
   reshape = FALSE,
   training = FALSE,
+  iterationrange = NULL,
+  strict_shape = FALSE,
   ...
 )

@@ -34,8 +36,7 @@ missing values in data (e.g., sometimes 0 or some other extreme value is used).}
 sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
 logistic regression would result in predictions for log-odds instead of probabilities.}

-\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
-It will use all the trees by default (\code{NULL} value).}
+\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}

 \item{predleaf}{whether predict leaf index.}

@@ -52,10 +53,20 @@ or predinteraction flags is TRUE.}
 \item{training}{whether the prediction result is used for training. For dart booster,
 predicting during training will perform dropout.}

+\item{iterationrange}{Specifies which layers of trees are used in prediction. For
+example, if a random forest is trained with 100 rounds, specifying
+\code{iterationrange = c(1, 21)} means only the forests built during the [1, 21)
+(half-open) rounds are used in this prediction. The index is 1-based, just like an
+R vector.
+When set to \code{c(1, 1)}, XGBoost will use all trees.}
+
+\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
+type and shape of predictions are invariant to the model type.}
+
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
 \value{
-For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+The return type differs depending on whether \code{strict_shape} is set to \code{TRUE}. By default,
+for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
 a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
 the \code{reshape} value.
@@ -76,18 +87,19 @@ two dimensions. The "+ 1" columns corresponds to bias. Summing this array along
 produce practically the same result as predict with \code{predcontrib = TRUE}.
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such an array.
+
+When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
+normal prediction, the output is a 2-dimensional array \code{(num_class, nrow(newdata))}.
+
+For \code{predcontrib = TRUE}, the output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predinteraction = TRUE}, the output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
+For \code{predleaf = TRUE}, the output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
 }
\description{
 Predicted values based on either xgboost model or model handle object.
 }
\details{
-Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
-and it is not necessarily equal to the number of trees in a model.
-E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, while there are multiple trees per iteration,
-\code{ntreelimit} limits the number of boosting iterations.
-
-Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+Note that \code{iterationrange} currently does nothing for predictions from gblinear,
 since gblinear doesn't keep its boosting history.
 One possible practical application of the \code{predleaf} option is to use the model
@@ -120,7 +132,7 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred1 <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, iterationrange = c(1, 2))

 # Predicting tree leaves:
 # the result is an nsamples X ntrees matrix
@@ -172,25 +184,9 @@ str(pred)
 all.equal(pred, pred_labels)
 # prediction from using only 5 iterations should result
 # in the same error as seen in iteration 5:
-pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
 sum(pred5 != lb)/length(lb)

-
-## random forest-like model of 25 trees for binary classification:
-
-set.seed(11)
-bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
-               nthread = 2, nrounds = 1, objective = "binary:logistic",
-               num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
-# Inspect the prediction error vs number of trees:
-lb <- test$label
-dtest <- xgb.DMatrix(test$data, label=lb)
-err <- sapply(1:25, function(n) {
-  pred <- predict(bst, dtest, ntreelimit=n)
-  sum((pred > 0.5) != lb)/length(lb)
-})
-plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
-
 }
\references{
 Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index fa67b86d5..09f5091bd 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -135,9 +135,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 parameter or randomly generated.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
+\item \code{best_ntreelimit} and \code{ntreelimit} are deprecated; use \code{best_iteration} instead.
 \item \code{pred} CV prediction values available when \code{prediction} is set.
 It is either vector or matrix (see \code{\link{cb.cv.predict}}).
 \item \code{models} a list of the CV folds' models. It is only available with the explicit
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 848fddca4..5313bb15e 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -187,9 +187,6 @@ An object of class \code{xgb.Booster} with the following elements:
 explicitly passed.
 \item \code{best_iteration} iteration number with the best evaluation metric value
 (only available with early stopping).
-\item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
-which could further be used in \code{predict} method
-(only available with early stopping).
 \item \code{best_score} the best evaluation metric value during early stopping.
 (only available with early stopping).
\item \code{feature_names} names of the training dataset features diff --git a/R-package/src/init.c b/R-package/src/init.c index f8f71843b..789a0b625 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -30,6 +30,7 @@ extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle); extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw); extern SEXP XGBoosterModelToRaw_R(SEXP); extern SEXP XGBoosterPredict_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSaveModel_R(SEXP, SEXP); extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); @@ -63,6 +64,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2}, {"XGBoosterModelToRaw_R", (DL_FUNC) &XGBoosterModelToRaw_R, 1}, {"XGBoosterPredict_R", (DL_FUNC) &XGBoosterPredict_R, 5}, + {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3}, {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2}, {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3}, {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index ccb193bc7..0746da30f 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -374,6 +374,45 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, return ret; } +SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { + SEXP r_out_shape; + SEXP r_out_result; + SEXP r_out; + + R_API_BEGIN(); + char const *c_json_config = CHAR(asChar(json_config)); + + bst_ulong out_dim; + bst_ulong const *out_shape; + float const *out_result; + CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(dmat), c_json_config, + &out_shape, &out_dim, &out_result)); + + r_out_shape = PROTECT(allocVector(INTSXP, out_dim)); + size_t len = 1; + for (size_t i = 0; i < out_dim; ++i) { + INTEGER(r_out_shape)[i] = out_shape[i]; + len *= out_shape[i]; + } + r_out_result = PROTECT(allocVector(REALSXP, len)); + +#pragma omp parallel for + for (omp_ulong i = 0; i < len; ++i) { + REAL(r_out_result)[i] = out_result[i]; + } + + r_out = PROTECT(allocVector(VECSXP, 2)); + + SET_VECTOR_ELT(r_out, 0, r_out_shape); + SET_VECTOR_ELT(r_out, 1, r_out_result); + + R_API_END(); + UNPROTECT(3); + + return r_out; +} + SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) { R_API_BEGIN(); CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 95a62aaed..dcf5327dd 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -164,7 +164,7 @@ XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP h XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames); /*! - * \brief make prediction based on dmat + * \brief (Deprecated) make prediction based on dmat * \param handle handle * \param dmat data matrix * \param option_mask output_margin:1 predict_leaf:2 @@ -173,6 +173,16 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn */ XGB_DLL SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit, SEXP training); + +/*! 
+ * \brief Run prediction on DMatrix, replacing `XGBoosterPredict_R`.
+ * \param handle handle
+ * \param dmat data matrix
+ * \param json_config See `XGBoosterPredictFromDMatrix` in xgboost c_api.h
+ *
+ * \return A list of length 2: the first element is the shape of the prediction, the second is the prediction result.
+ */
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config);
 /*!
  * \brief load model from existing file
  * \param handle handle
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index 5e2e3ee6c..a97eb01ba 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -34,6 +34,10 @@ test_that("train and predict binary classification", {
   err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
   err_log <- bst$evaluation_log[1, train_error]
   expect_lt(abs(err_pred1 - err_log), 10e-6)
+
+  pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_length(pred1, 6513)
+  expect_equal(pred1, pred2)
 })

 test_that("parameter validation works", {
@@ -143,6 +147,9 @@ test_that("train and predict softprob", {
   pred_labels <- max.col(mpred) - 1
   err <- sum(pred_labels != lb) / length(lb)
   expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
+
+  mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
+  expect_equal(mpred, mpred1)
 })

 test_that("train and predict softmax", {
@@ -182,10 +189,8 @@ test_that("train and predict RF", {
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
   expect_equal(pred_err_20, pred_err)

-  #pred <- predict(bst, train$data, ntreelimit = 1)
-  #pred_err_1 <- sum((pred > 0.5) != lb)/length(lb)
-  #expect_lt(pred_err, pred_err_1)
-  #expect_lt(pred_err, 0.08)
+  pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
+  expect_equal(pred, pred1)
 })

 test_that("train and predict RF with softprob", {
@@ -385,3 +390,57 @@ test_that("Configuration works", {
   reloaded_config <- xgb.config(bst)
   expect_equal(config, reloaded_config);
 })
+
+test_that("strict_shape works", {
+  n_rounds <- 2
+
+  test_strict_shape <- function(bst, X, n_groups) {
+    predt <- predict(bst, X, strict_shape = TRUE)
+    margin <- predict(bst, X, outputmargin = TRUE, strict_shape = TRUE)
+    contri <- predict(bst, X, predcontrib = TRUE, strict_shape = TRUE)
+    interact <- predict(bst, X, predinteraction = TRUE, strict_shape = TRUE)
+    leaf <- predict(bst, X, predleaf = TRUE, strict_shape = TRUE)
+
+    n_rows <- nrow(X)
+    n_cols <- ncol(X)
+
+    expect_equal(dim(predt), c(n_groups, n_rows))
+    expect_equal(dim(margin), c(n_groups, n_rows))
+    expect_equal(dim(contri), c(n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(interact), c(n_cols + 1, n_cols + 1, n_groups, n_rows))
+    expect_equal(dim(leaf), c(1, n_groups, n_rounds, n_rows))
+
+    if (n_groups != 1) {
+      for (g in seq_len(n_groups)) {
+        expect_lt(max(abs(colSums(contri[, g, ]) - margin[g, ])), 1e-5)
+      }
+    }
+  }
+
+  test_iris <- function() {
+    y <- as.numeric(iris$Species) - 1
+    X <- as.matrix(iris[, -5])
+
+    bst <- xgboost(data = X, label = y,
+                   max_depth = 2, nrounds = n_rounds,
+                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
+
+    test_strict_shape(bst, X, 3)
+  }
+
+  test_agaricus <- function() {
+    data(agaricus.train, package = 'xgboost')
+    X <- agaricus.train$data
+    y <- agaricus.train$label
+
+    bst <- xgboost(data = X, label = y, max_depth = 2,
+                   nrounds = n_rounds, objective = "binary:logistic",
+                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+
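+    ## binary:logistic yields a single output group per row, hence n_groups = 1.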
+    test_strict_shape(bst, X, 1)
+  }
+
+  test_iris()
+  test_agaricus()
+})
diff --git a/doc/prediction.rst b/doc/prediction.rst
index 39ca3f739..60b6ad089 100644
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -6,7 +6,7 @@ Prediction

 There are a number of prediction functions in XGBoost with various parameters.  This
 document attempts to clarify some of the confusion around prediction with a focus on the
-Python binding.
+Python binding; the R package behaves similarly when ``strict_shape`` is specified (see below).

 ******************
 Prediction Options
@@ -58,6 +58,13 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 ``apply`` method in scikit learn interface, this is set to False by default.

+For the R package, when ``strict_shape`` is specified, an ``array`` is returned with the
+same values as in Python, except that R arrays are column-major while Python numpy arrays
+are row-major, so all the dimensions are reversed.  For example, a Python ``predict_leaf``
+output obtained with ``strict_shape=True`` has 4 dimensions: ``(n_samples, n_iterations,
+n_classes, n_trees_in_forest)``, while R with ``strict_shape=TRUE`` outputs
+``(n_trees_in_forest, n_classes, n_iterations, n_samples)``.
+
 Other than these prediction types, there's also a parameter called ``iteration_range``, which
 is similar to model slicing.  But instead of actually splitting up the model into multiple
 stacks, it simply returns the prediction formed by the trees within range.
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 0247e0ad8..9a9177ff3 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -111,9 +111,8 @@ def _convert_ntree_limit(
         raise ValueError(
             "Only one of `iteration_range` and `ntree_limit` can be non zero."
) - num_parallel_tree, num_groups = _get_booster_layer_trees(booster) + num_parallel_tree, _ = _get_booster_layer_trees(booster) num_parallel_tree = max([num_parallel_tree, 1]) - num_groups = max([num_groups, 1]) iteration_range = (0, ntree_limit // num_parallel_tree) return iteration_range diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 32a5ae167..0b048c988 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -662,9 +662,21 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, auto *learner = static_cast(handle); auto& entry = learner->GetThreadLocal().prediction_entry; auto p_m = *static_cast *>(dmat); - auto type = PredictionType(get(config["type"])); - auto iteration_begin = get(config["iteration_begin"]); - auto iteration_end = get(config["iteration_end"]); + + auto const& j_config = get(config); + auto type = PredictionType(get(j_config.at("type"))); + auto iteration_begin = get(j_config.at("iteration_begin")); + auto iteration_end = get(j_config.at("iteration_end")); + + auto ntree_limit_it = j_config.find("ntree_limit"); + if (ntree_limit_it != j_config.cend() && !IsA(ntree_limit_it->second) && + get(ntree_limit_it->second) != 0) { + CHECK(iteration_end == 0) << + "Only one of the `ntree_limit` or `iteration_range` can be specified."; + LOG(WARNING) << "`ntree_limit` is deprecated, use `iteration_range` instead."; + iteration_end = GetIterationFromTreeLimit(get(ntree_limit_it->second), learner); + } + bool approximate = type == PredictionType::kApproxContribution || type == PredictionType::kApproxInteraction; bool contribs = type == PredictionType::kContribution || diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index 46e12bd83..3a8b13059 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -48,7 +48,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows *out_dim = 2; shape.resize(*out_dim); shape.front() = rows; - shape.back() = groups; + shape.back() = std::min(groups, chunksize); } break; } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index a81c10965..2a8669f00 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -587,7 +587,6 @@ void QuantileHistMaker::Builder::InitSampling(const DMatrix& fmat, #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG std::bernoulli_distribution coin_flip(param_.subsample); - size_t used = 0, unused = 0; for (size_t i = 0; i < info.num_row_; ++i) { if (!(gpair_ref[i].GetHess() >= 0.0f && coin_flip(rnd)) || gpair_ref[i].GetGrad() == 0.0f) { gpair_ref[i] = GradientPair(0);
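
A minimal usage sketch of the two new predict arguments, assuming the agaricus data
bundled with the R package and a 2-round binary:logistic model (the variable names
below are illustrative, not taken from the patch):

    library(xgboost)
    data(agaricus.train, package = "xgboost")
    bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   nrounds = 2, objective = "binary:logistic", nthread = 2)

    ## deprecated style: limit prediction to the first boosting round
    p_old <- predict(bst, agaricus.train$data, ntreelimit = 1)
    ## new style: 1-based, half-open range covering the first round only
    p_new <- predict(bst, agaricus.train$data, iterationrange = c(1, 2))
    stopifnot(isTRUE(all.equal(p_old, p_new)))

    ## strict_shape always returns an array; for a binary model a normal
    ## prediction has shape (num_class, nrow(newdata)), here c(1, 6513)
    p_strict <- predict(bst, agaricus.train$data, strict_shape = TRUE)
    dim(p_strict)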