merge latest changes

Hui Liu
2024-01-24 13:30:08 -08:00
83 changed files with 1408 additions and 1273 deletions

View File

@@ -65,6 +65,6 @@ Imports:
data.table (>= 1.9.6),
jsonlite (>= 1.0)
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.0
Encoding: UTF-8
SystemRequirements: GNU make, C++17

View File

@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand
S3method("[",xgb.Booster)
S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(coef,xgb.Booster)
@@ -7,6 +8,7 @@ S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.Booster)
S3method(getinfo,xgb.DMatrix)
S3method(length,xgb.Booster)
S3method(predict,xgb.Booster)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
@@ -62,6 +64,7 @@ export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.set.config)
export(xgb.slice.Booster)
export(xgb.train)
export(xgboost)
import(methods)

View File

@@ -280,7 +280,6 @@ cb.reset.parameters <- function(new_params) {
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration},
#' \code{num_parallel_tree}.
#'
#' @seealso
#' \code{\link{callbacks}},
@@ -291,7 +290,6 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
metric_name = NULL, verbose = TRUE) {
# state variables
best_iteration <- -1
best_ntreelimit <- -1
best_score <- Inf
best_msg <- NULL
metric_idx <- 1
@@ -358,12 +356,10 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
# If the difference is due to floating-point truncation, update best_score
best_score <- attr_best_score
}
xgb.attr(env$bst, "best_iteration") <- best_iteration
xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
xgb.attr(env$bst, "best_iteration") <- best_iteration - 1
xgb.attr(env$bst, "best_score") <- best_score
} else {
env$basket$best_iteration <- best_iteration
env$basket$best_ntreelimit <- best_ntreelimit
}
}
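A sketch of the indexing convention this change establishes, assuming a booster `bst` whose best round was the 7th in R's 1-based terms: the `best_iteration` attribute serialized with the model is now 0-based (libxgboost's convention), while R-side values such as the CV basket's `best_iteration` stay 1-based.

    xgb.attr(bst, "best_iteration")                       # 6 (0-based)
    # evaluation logs are 1-based, hence the + 1 when indexing by attribute:
    attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration") + 1, ]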
@@ -385,14 +381,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
)
best_score <<- score
best_iteration <<- i
best_ntreelimit <<- best_iteration * env$num_parallel_tree
# save the properties to attributes, so they will be present in the checkpoint
if (!is.null(env$bst)) {
xgb.attributes(env$bst) <- list(
best_iteration = best_iteration - 1, # convert to 0-based index
best_score = best_score,
best_msg = best_msg,
best_ntreelimit = best_ntreelimit)
best_msg = best_msg
)
}
} else if (i - best_iteration >= stopping_rounds) {
env$stop_condition <- TRUE
@@ -475,8 +470,6 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
#' \code{data},
#' \code{end_iteration},
#' \code{params},
#' \code{num_parallel_tree},
#' \code{num_class}.
#'
#' @return
#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
@@ -499,19 +492,21 @@ cb.cv.predict <- function(save_models = FALSE) {
stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
N <- nrow(env$data)
pred <-
if (env$num_class > 1) {
matrix(NA_real_, N, env$num_class)
} else {
rep(NA_real_, N)
}
pred <- NULL
iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration))
if (NVL(env$params[['booster']], '') == 'gblinear') {
iterationrange <- c(1, 1) # must be 0 for gblinear
iterationrange <- "all"
}
for (fd in env$bst_folds) {
pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
if (is.null(pred)) {
if (NCOL(pr) > 1L) {
pred <- matrix(NA_real_, N, ncol(pr))
} else {
pred <- matrix(NA_real_, N)
}
}
if (is.matrix(pred)) {
pred[fd$index, ] <- pr
} else {

View File

@@ -208,7 +208,7 @@ xgb.iter.eval <- function(bst, watchlist, iter, feval) {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
## predict using all trees
preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all")
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)

View File

@@ -89,7 +89,6 @@ xgb.get.handle <- function(object) {
#' @param outputmargin Whether the prediction should be returned in the form of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for
#' logistic regression would return log-odds instead of probabilities.
#' @param ntreelimit Deprecated, use `iterationrange` instead.
#' @param predleaf Whether to predict per-tree leaf indices.
#' @param predcontrib Whether to return feature contributions to individual predictions (see Details).
#' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details).
@@ -99,11 +98,17 @@ xgb.get.handle <- function(object) {
#' or `predinteraction` is `TRUE`.
#' @param training Whether the predictions are used for training. For dart booster,
#' training predicting will perform dropout.
#' @param iterationrange Specifies which trees are used in prediction. For
#' example, take a random forest with 100 rounds.
#' With `iterationrange=c(1, 21)`, only the trees built during `[1, 21)` (half open set)
#' rounds are used in this prediction. The index is 1-based just like an R vector. When set
#' to `c(1, 1)`, XGBoost will use all trees.
#' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing
#' a vector with two elements giving the start and end numbers in the sequence (same format as R's `seq` - i.e.
#' base-1 indexing, and inclusive of both ends).
#'
#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
#' predict using only the first one.
#'
#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
#' of the iterations (rounds) otherwise.
#'
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
#' type and shape of predictions are invariant to the model type.
#' @param ... Not used.
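A brief usage sketch of these `iterationrange` semantics, assuming a fitted booster `bst` and a feature matrix `x` (both hypothetical):

    predict(bst, x, iterationrange = c(1, 5))  # rounds 1 through 5, both ends inclusive
    predict(bst, x, iterationrange = NULL)     # best iteration if early stopping was used, else all
    predict(bst, x, iterationrange = "all")    # every round, regardless of early stopping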
@@ -189,7 +194,7 @@ xgb.get.handle <- function(object) {
#' # use all trees by default
#' pred <- predict(bst, test$data)
#' # use only the 1st tree
#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
#' pred1 <- predict(bst, test$data, iterationrange = c(1, 1))
#'
#' # Predicting tree leaves:
#' # the result is an nsamples X ntrees matrix
@@ -260,11 +265,11 @@ xgb.get.handle <- function(object) {
#' all.equal(pred, pred_labels)
#' # prediction from using only 5 iterations should result
#' # in the same error as seen in iteration 5:
#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 5))
#' sum(pred5 != lb) / length(lb)
#'
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
if (!inherits(newdata, "xgb.DMatrix")) {
@@ -275,25 +280,21 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
)
}
if (NVL(xgb.booster_type(object), '') == 'gblinear' || is.null(ntreelimit))
ntreelimit <- 0
if (ntreelimit != 0 && is.null(iterationrange)) {
## only ntreelimit, initialize iteration range
iterationrange <- c(0, 0)
} else if (ntreelimit == 0 && !is.null(iterationrange)) {
## only iteration range, handle 1-based indexing
iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
} else if (ntreelimit != 0 && !is.null(iterationrange)) {
## both are specified, let libxgboost throw an error
if (!is.null(iterationrange)) {
if (is.character(iterationrange)) {
stopifnot(iterationrange == "all")
iterationrange <- c(0, 0)
} else {
iterationrange[1] <- iterationrange[1] - 1 # base-0 indexing
}
} else {
## no limit is supplied, use best
best_iteration <- xgb.best_iteration(object)
if (is.null(best_iteration)) {
iterationrange <- c(0, 0)
} else {
## We don't need to + 1 as R is 1-based index.
iterationrange <- c(0, as.integer(best_iteration))
iterationrange <- c(0, as.integer(best_iteration) + 1L)
}
}
## Handle the 0 length values.
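## To summarize the conversion above: the user-facing base-1 inclusive range
## [start, end] maps onto libxgboost's base-0 half-open [begin, end), so only
## the start needs decrementing -- a base-1 inclusive end and a base-0
## exclusive end are the same number. E.g. c(1, 5) becomes begin = 0, end = 5
## (rounds 1..5), and "all" becomes begin = 0, end = 0 (0 meaning all rounds).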
@@ -312,7 +313,6 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
strict_shape = box(TRUE),
iteration_begin = box(as.integer(iterationrange[1])),
iteration_end = box(as.integer(iterationrange[2])),
ntree_limit = box(as.integer(ntreelimit)),
type = box(as.integer(0))
)
@@ -343,24 +343,24 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
)
names(predts) <- c("shape", "results")
shape <- predts$shape
ret <- predts$results
arr <- predts$results
n_ret <- length(ret)
n_ret <- length(arr)
n_row <- nrow(newdata)
if (n_row != shape[1]) {
stop("Incorrect predict shape.")
}
arr <- array(data = ret, dim = rev(shape))
.Call(XGSetArrayDimInplace_R, arr, rev(shape))
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
n_groups <- shape[2]
## Needed regardless of whether strict shape is being used.
if (predcontrib) {
dimnames(arr) <- list(cnames, NULL, NULL)
.Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, NULL, NULL))
} else if (predinteraction) {
dimnames(arr) <- list(cnames, cnames, NULL, NULL)
.Call(XGSetArrayDimNamesInplace_R, arr, list(cnames, cnames, NULL, NULL))
}
if (strict_shape) {
return(arr) # strict shape is calculated by libxgboost uniformly.
@@ -368,43 +368,51 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
if (predleaf) {
## Predict leaf
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1)
if (n_ret == n_row) {
.Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
} else {
matrix(arr, nrow = n_row, byrow = TRUE)
arr <- matrix(arr, nrow = n_row, byrow = TRUE)
}
} else if (predcontrib) {
## Predict contribution
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
if (n_ret == n_row) {
.Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
.Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , ])
arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
} else {
## remove the first axis (group)
dn <- dimnames(arr)
matrix(arr[1, , ], nrow = dim(arr)[2], ncol = dim(arr)[3], dimnames = c(dn[2], dn[3]))
newdim <- dim(arr)[2:3]
newdn <- dimnames(arr)[2:3]
arr <- arr[1, , ]
.Call(XGSetArrayDimInplace_R, arr, newdim)
.Call(XGSetArrayDimNamesInplace_R, arr, newdn)
}
} else if (predinteraction) {
## Predict interaction
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
if (n_ret == n_row) {
.Call(XGSetArrayDimInplace_R, arr, c(n_row, 1L))
.Call(XGSetArrayDimNamesInplace_R, arr, list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , , ])
arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else {
## remove the first axis (group)
arr <- arr[1, , , , drop = FALSE]
array(arr, dim = dim(arr)[2:4], dimnames(arr)[2:4])
newdim <- dim(arr)[2:4]
newdn <- dimnames(arr)[2:4]
.Call(XGSetArrayDimInplace_R, arr, newdim)
.Call(XGSetArrayDimNamesInplace_R, arr, newdn)
}
} else {
## Normal prediction
arr <- if (reshape && n_groups != 1) {
matrix(arr, ncol = n_groups, byrow = TRUE)
if (reshape && n_groups != 1) {
arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
} else {
as.vector(ret)
.Call(XGSetArrayDimInplace_R, arr, NULL)
}
}
return(arr)
@@ -492,7 +500,7 @@ xgb.attr <- function(object, name) {
return(NULL)
}
if (!is.null(out)) {
if (name %in% c("best_iteration", "best_ntreelimit", "best_score")) {
if (name %in% c("best_iteration", "best_score")) {
out <- as.numeric(out)
}
}
@@ -685,16 +693,94 @@ setinfo.xgb.Booster <- function(object, name, info) {
}
#' @title Get number of boosting rounds in a fitted booster
#' @param model A fitted `xgb.Booster` model.
#' @param model,x A fitted `xgb.Booster` model.
#' @return The number of rounds saved in the model, as an integer.
#' @details Note that setting booster parameters related to training
#' continuation / updates through \link{xgb.parameters<-} will reset the
#' number of rounds to zero.
#' @export
#' @rdname xgb.get.num.boosted.rounds
xgb.get.num.boosted.rounds <- function(model) {
return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model)))
}
#' @rdname xgb.get.num.boosted.rounds
#' @export
length.xgb.Booster <- function(x) {
return(xgb.get.num.boosted.rounds(x))
}
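## Usage sketch (assuming a booster trained for 5 rounds):
##   xgb.get.num.boosted.rounds(model)  # 5
##   length(model)                      # also 5, via the S3 method above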
#' @title Slice Booster by Rounds
#' @description Creates a new booster including only a selected range of rounds / iterations
#' from an existing booster, as given by the sequence `seq(start, end, step)`.
#' @details Note that any R attributes that the booster might have will not be copied into
#' the resulting object.
#' @param model,x A fitted `xgb.Booster` object, which is to be sliced by taking only a subset
#' of its rounds / iterations.
#' @param start Start of the slice (base-1 and inclusive, like R's \link{seq}).
#' @param end End of the slice (base-1 and inclusive, like R's \link{seq}).
#'
#' Passing a value of zero here is equivalent to passing the full number of rounds in the
#' booster object.
#' @param step Step size of the slice. Passing '1' will take every round in the sequence defined by
#' `(start, end)`, while passing '2' will take every second value, and so on.
#' @return A sliced booster object containing only the requested rounds.
#' @examples
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#' model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
#' model_slice <- xgb.slice.Booster(model, 1, 3)
#' # Prediction for first three rounds
#' predict(model, x, predleaf = TRUE)[, 1:3]
#'
#' # The new model has only those rounds, so
#' # a full prediction from it is equivalent
#' predict(model_slice, x, predleaf = TRUE)
#' @export
#' @rdname xgb.slice.Booster
xgb.slice.Booster <- function(model, start, end = xgb.get.num.boosted.rounds(model), step = 1L) {
# This makes the slice mimic the behavior of R's 'seq',
# which truncates the end of the slice when the step
# doesn't land exactly on it.
if (end > start && step > 1) {
d <- (end - start + 1) / step
if (d != floor(d)) {
end <- start + step * ceiling(d) - 1
}
}
return(
.Call(
XGBoosterSlice_R,
xgb.get.handle(model),
start - 1,
end,
step
)
)
}
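# For instance (values purely illustrative): start = 1, end = 10, step = 4
# keeps rounds 1, 5, 9 -- the same set as seq(1, 10, 4) -- even though the
# step does not land exactly on 'end'.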
#' @export
#' @rdname xgb.slice.Booster
#' @param i The indices: must be an increasing sequence, as generated by e.g. `seq(...)`.
`[.xgb.Booster` <- function(x, i) {
if (missing(i)) {
return(xgb.slice.Booster(x, 1, 0))
}
if (length(i) == 1) {
return(xgb.slice.Booster(x, i, i))
}
steps <- diff(i)
if (any(steps < 0)) {
stop("Can only slice booster with ascending sequences.")
}
if (length(unique(steps)) > 1) {
stop("Can only slice booster with fixed-step sequences.")
}
return(xgb.slice.Booster(x, i[1L], i[length(i)], steps[1L]))
}
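# Assumed usage of this operator, for a booster with 20 rounds:
#   model[1:10]           # first ten rounds
#   model[seq(1, 19, 2)]  # every second round
#   model[]               # all rounds, same as xgb.slice.Booster(x, 1, 0)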
#' @title Get Feature Names from Booster
#' @description Returns the feature / variable / column names from a fitted
#' booster object, which are set automatically during the call to \link{xgb.train}
@@ -710,12 +796,6 @@ variable.names.xgb.Booster <- function(object, ...) {
return(getinfo(object, "feature_name"))
}
xgb.ntree <- function(bst) {
config <- xgb.config(bst)
out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees)
return(out)
}
xgb.nthread <- function(bst) {
config <- xgb.config(bst)
out <- strtoi(config$learner$generic_param$nthread)

View File

@@ -103,7 +103,6 @@
#' parameter or randomly generated.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
#' \item \code{pred} CV prediction values available when \code{prediction} is set.
#' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
#' \item \code{models} a list of the CV folds' models. It is only available with the explicit
@@ -218,7 +217,6 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
# those are fixed for CV (no training continuation)
begin_iteration <- 1
@@ -318,7 +316,7 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
})
}
for (n in c('niter', 'best_iteration', 'best_ntreelimit')) {
for (n in c('niter', 'best_iteration')) {
if (is.null(x[[n]]))
next
cat(n, ': ', x[[n]], '\n', sep = '')

View File

@@ -113,19 +113,12 @@
#' xgb.importance(model = mbst)
#'
#' @export
xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
xgb.importance <- function(model = NULL, feature_names = getinfo(model, "feature_name"), trees = NULL,
data = NULL, label = NULL, target = NULL) {
if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
if (is.null(feature_names)) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names)) {
feature_names <- model_feature_names
}
}
if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector")

View File

@@ -2,11 +2,8 @@
#'
#' Parse a boosted tree model text dump into a `data.table` structure.
#'
#' @param feature_names Character vector of feature names. If the model already
#' contains feature names, those will be used when \code{feature_names=NULL} (default value).
#'
#' Note that, if the model already contains feature names, it's \bold{not} possible to override them here.
#' @param model Object of class `xgb.Booster`.
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
#' \link{setinfo}), they will be used in the output from this function.
#' @param text Character vector previously generated by the function [xgb.dump()]
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
#' @param trees An integer vector of tree indices that should be used.
@@ -58,7 +55,7 @@
#'
#' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst))
#' dt <- xgb.model.dt.tree(bst)
#'
#' # How to match feature names of splits that are following a current 'Yes' branch:
#' merge(
@@ -69,7 +66,7 @@
#' ]
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
xgb.model.dt.tree <- function(model = NULL, text = NULL,
trees = NULL, use_int_id = FALSE, ...) {
check.deprecation(...)
@@ -79,24 +76,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
" (or NULL if 'model' was provided).")
}
model_feature_names <- NULL
if (inherits(model, "xgb.Booster")) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names) && !is.null(feature_names)) {
stop("'model' contains feature names. Cannot override them.")
}
}
if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names))
feature_names <- model_feature_names
if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector")
}
if (!(is.null(trees) || is.numeric(trees))) {
stop("trees: must be a vector of integers.")
}
feature_names <- NULL
if (inherits(model, "xgb.Booster")) {
feature_names <- xgb.feature_names(model)
}
from_text <- TRUE
if (is.null(text)) {
text <- xgb.dump(model = model, with_stats = TRUE)
@@ -134,7 +122,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
text_has_feature_names <- FALSE
if (NROW(model_feature_names)) {
if (NROW(feature_names)) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
} else {
@@ -148,9 +136,6 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
}
}
}
if (text_has_feature_names && is.null(model) && !is.null(feature_names)) {
stop("'text' contains feature names. Cannot override them.")
}
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
td[
isLeaf == FALSE,

View File

@@ -62,13 +62,13 @@
#' }
#'
#' @export
xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
xgb.plot.multi.trees <- function(model, features_keep = 5, plot_width = NULL, plot_height = NULL,
render = TRUE, ...) {
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("DiagrammeR is required for xgb.plot.multi.trees")
}
check.deprecation(...)
tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
tree.matrix <- xgb.model.dt.tree(model = model)
# first number of the path represents the tree, then the following numbers are related to the path to follow
# root init

View File

@@ -2,9 +2,8 @@
#'
#' Read a tree model text dump and plot the model.
#'
#' @param feature_names Character vector used to overwrite the feature names
#' of the model. The default (`NULL`) uses the original feature names.
#' @param model Object of class `xgb.Booster`.
#' @param model Object of class `xgb.Booster`. If it contains feature names (they can be set through
#' \link{setinfo}), they will be used in the output from this function.
#' @param trees An integer vector of tree indices that should be used.
#' The default (`NULL`) uses all trees.
#' Useful, e.g., in multiclass classification to get only
@@ -103,7 +102,7 @@
#' }
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
xgb.plot.tree <- function(model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
render = TRUE, show_node_id = FALSE, style = c("R", "xgboost"), ...) {
check.deprecation(...)
if (!inherits(model, "xgb.Booster")) {
@@ -120,17 +119,12 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
if (NROW(trees) != 1L || !render || show_node_id) {
stop("style='xgboost' is only supported for single, rendered tree, without node IDs.")
}
if (!is.null(feature_names)) {
stop(
"style='xgboost' cannot override 'feature_names'. Will automatically take them from the model."
)
}
txt <- xgb.dump(model, dump_format = "dot")
return(DiagrammeR::grViz(txt[[trees + 1]], width = plot_width, height = plot_height))
}
dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)
dt <- xgb.model.dt.tree(model = model, trees = trees)
dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Gain)]
if (show_node_id)

View File

@@ -393,7 +393,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# Note: it might look like these aren't used, but they need to be defined in this
# environment for the callbacks to work correctly.
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
if (is_update && nrounds > niter_init)
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

View File

@@ -15,7 +15,7 @@ cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest, 'label')
### predict using first 1 tree
ypred1 <- predict(bst, dtest, ntreelimit = 1)
ypred1 <- predict(bst, dtest, iterationrange = c(1, 1))
# by default, we predict using all the trees
ypred2 <- predict(bst, dtest)

View File

@@ -35,8 +35,6 @@ Callback function expects the following values to be set in its calling frame:
\code{data},
\code{end_iteration},
\code{params},
\code{num_parallel_tree},
\code{num_class}.
}
\seealso{
\code{\link{callbacks}}

View File

@@ -55,7 +55,6 @@ Callback function expects the following values to be set in its calling frame:
\code{iteration},
\code{begin_iteration},
\code{end_iteration},
\code{num_parallel_tree}.
}
\seealso{
\code{\link{callbacks}},

View File

@@ -9,7 +9,6 @@
newdata,
missing = NA,
outputmargin = FALSE,
ntreelimit = NULL,
predleaf = FALSE,
predcontrib = FALSE,
approxcontrib = FALSE,
@@ -36,8 +35,6 @@ missing values in data (e.g., 0 or some other extreme value).}
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would return log-odds instead of probabilities.}
\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}
\item{predleaf}{Whether to predict per-tree leaf indices.}
\item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}
@@ -53,11 +50,18 @@ or \code{predinteraction} is \code{TRUE}.}
\item{training}{Whether the predictions are used for training. For dart booster,
training predicting will perform dropout.}
\item{iterationrange}{Specifies which trees are used in prediction. For
example, take a random forest with 100 rounds.
With \code{iterationrange=c(1, 21)}, only the trees built during \verb{[1, 21)} (half open set)
rounds are used in this prediction. The index is 1-based just like an R vector. When set
to \code{c(1, 1)}, XGBoost will use all trees.}
\item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
a vector with two elements giving the start and end numbers in the sequence (same format as R's \code{seq} - i.e.
base-1 indexing, and inclusive of both ends).
\if{html}{\out{<div class="sourceCode">}}\preformatted{ For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
predict using only the first one.
If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
of the iterations (rounds) otherwise.
If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
}\if{html}{\out{</div>}}}
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
type and shape of predictions are invariant to the model type.}
@@ -145,7 +149,7 @@ bst <- xgb.train(
# use all trees by default
pred <- predict(bst, test$data)
# use only the 1st tree
pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
pred1 <- predict(bst, test$data, iterationrange = c(1, 1))
# Predicting tree leaves:
# the result is an nsamples X ntrees matrix
@@ -216,7 +220,7 @@ str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 5))
sum(pred5 != lb) / length(lb)
}

View File

@@ -135,7 +135,6 @@ It is created by the \code{\link{cb.evaluation.log}} callback.
parameter or randomly generated.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
\item \code{pred} CV prediction values available when \code{prediction} is set.
It is either vector or matrix (see \code{\link{cb.cv.predict}}).
\item \code{models} a list of the CV folds' models. It is only available with the explicit

View File

@@ -2,12 +2,15 @@
% Please edit documentation in R/xgb.Booster.R
\name{xgb.get.num.boosted.rounds}
\alias{xgb.get.num.boosted.rounds}
\alias{length.xgb.Booster}
\title{Get number of boosting rounds in a fitted booster}
\usage{
xgb.get.num.boosted.rounds(model)
\method{length}{xgb.Booster}(x)
}
\arguments{
\item{model}{A fitted \code{xgb.Booster} model.}
\item{model, x}{A fitted \code{xgb.Booster} model.}
}
\value{
The number of rounds saved in the model, as an integer.

View File

@@ -5,8 +5,8 @@
\title{Feature importance}
\usage{
xgb.importance(
feature_names = NULL,
model = NULL,
feature_names = getinfo(model, "feature_name"),
trees = NULL,
data = NULL,
label = NULL,
@@ -14,11 +14,11 @@ xgb.importance(
)
}
\arguments{
\item{model}{Object of class \code{xgb.Booster}.}
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default is \code{NULL} (use original feature names).}
\item{model}{Object of class \code{xgb.Booster}.}
\item{trees}{An integer vector of tree indices that should be included
into the importance calculation (only for the "gbtree" booster).
The default (\code{NULL}) parses all trees.

View File

@@ -5,7 +5,6 @@
\title{Parse model text dump}
\usage{
xgb.model.dt.tree(
feature_names = NULL,
model = NULL,
text = NULL,
trees = NULL,
@@ -14,13 +13,8 @@ xgb.model.dt.tree(
)
}
\arguments{
\item{feature_names}{Character vector of feature names. If the model already
contains feature names, those will be used when \code{feature_names=NULL} (default value).
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that, if the model already contains feature names, it's \\bold\{not\} possible to override them here.
}\if{html}{\out{</div>}}}
\item{model}{Object of class \code{xgb.Booster}.}
\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
\link{setinfo}), they will be used in the output from this function.}
\item{text}{Character vector previously generated by the function \code{\link[=xgb.dump]{xgb.dump()}}
(called with parameter \code{with_stats = TRUE}). \code{text} takes precedence over \code{model}.}
@@ -81,7 +75,7 @@ bst <- xgboost(
# This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst))
dt <- xgb.model.dt.tree(bst)
# How to match feature names of splits that are following a current 'Yes' branch:
merge(

View File

@@ -6,7 +6,6 @@
\usage{
xgb.plot.multi.trees(
model,
feature_names = NULL,
features_keep = 5,
plot_width = NULL,
plot_height = NULL,
@@ -15,10 +14,8 @@ xgb.plot.multi.trees(
)
}
\arguments{
\item{model}{Object of class \code{xgb.Booster}.}
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}
\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
\link{setinfo}), they will be used in the output from this function.}
\item{features_keep}{Number of features to keep in each position of the multi trees,
by default 5.}

View File

@@ -5,7 +5,6 @@
\title{Plot boosted trees}
\usage{
xgb.plot.tree(
feature_names = NULL,
model = NULL,
trees = NULL,
plot_width = NULL,
@@ -17,10 +16,8 @@ xgb.plot.tree(
)
}
\arguments{
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}
\item{model}{Object of class \code{xgb.Booster}.}
\item{model}{Object of class \code{xgb.Booster}. If it contains feature names (they can be set through
\link{setinfo}), they will be used in the output from this function.}
\item{trees}{An integer vector of tree indices that should be used.
The default (\code{NULL}) uses all trees.

View File

@@ -0,0 +1,57 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.slice.Booster}
\alias{xgb.slice.Booster}
\alias{[.xgb.Booster}
\title{Slice Booster by Rounds}
\usage{
xgb.slice.Booster(
model,
start,
end = xgb.get.num.boosted.rounds(model),
step = 1L
)
\method{[}{xgb.Booster}(x, i)
}
\arguments{
\item{model, x}{A fitted \code{xgb.Booster} object, which is to be sliced by taking only a subset
of its rounds / iterations.}
\item{start}{Start of the slice (base-1 and inclusive, like R's \link{seq}).}
\item{end}{End of the slice (base-1 and inclusive, like R's \link{seq}).
Passing a value of zero here is equivalent to passing the full number of rounds in the
booster object.}
\item{step}{Step size of the slice. Passing '1' will take every round in the sequence defined by
\verb{(start, end)}, while passing '2' will take every second value, and so on.}
\item{i}{The indices: must be an increasing sequence, as generated by e.g. \code{seq(...)}.}
}
\value{
A sliced booster object containing only the requested rounds.
}
\description{
Creates a new booster including only a selected range of rounds / iterations
from an existing booster, as given by the sequence \code{seq(start, end, step)}.
}
\details{
Note that any R attributes that the booster might have will not be copied into
the resulting object.
}
\examples{
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 5)
model_slice <- xgb.slice.Booster(model, 1, 3)
# Prediction for first three rounds
predict(model, x, predleaf = TRUE)[, 1:3]
# The new model has only those rounds, so
# a full prediction from it is equivalent
predict(model_slice, x, predleaf = TRUE)
}

View File

@@ -42,6 +42,8 @@ extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP);
extern SEXP XGCheckNullPtr_R(SEXP);
extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
@@ -62,6 +64,7 @@ extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
extern SEXP XGBSetGlobalConfig_R(SEXP);
extern SEXP XGBGetGlobalConfig_R(void);
extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
extern SEXP XGBoosterSlice_R(SEXP, SEXP, SEXP, SEXP);
static const R_CallMethodDef CallEntries[] = {
{"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1},
@@ -90,6 +93,8 @@ static const R_CallMethodDef CallEntries[] = {
{"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},
{"XGBoosterUpdateOneIter_R", (DL_FUNC) &XGBoosterUpdateOneIter_R, 3},
{"XGCheckNullPtr_R", (DL_FUNC) &XGCheckNullPtr_R, 1},
{"XGSetArrayDimInplace_R", (DL_FUNC) &XGSetArrayDimInplace_R, 2},
{"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
{"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
{"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
{"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
@@ -110,6 +115,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1},
{"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0},
{"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2},
{"XGBoosterSlice_R", (DL_FUNC) &XGBoosterSlice_R, 4},
{NULL, NULL, 0}
};

View File

@@ -263,6 +263,16 @@ XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
}
XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims) {
Rf_setAttrib(arr, R_DimSymbol, dims);
return R_NilValue;
}
XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names) {
Rf_setAttrib(arr, R_DimNamesSymbol, dim_names);
return R_NilValue;
}
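/* Rationale (assumed, not stated in the source): Rf_setAttrib mutates the
 * array without duplicating it, unlike R-level `dim(arr) <- d`, which may
 * copy a large prediction buffer; callers should therefore hold the only
 * reference to the array they pass in. */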
namespace {
void _DMatrixFinalizer(SEXP ext) {
R_API_BEGIN();
@@ -1279,3 +1289,18 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
return r_out;
}
XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step) {
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
R_API_BEGIN();
BoosterHandle handle_out = nullptr;
CHECK_CALL(XGBoosterSlice(R_ExternalPtrAddr(handle),
Rf_asInteger(begin_layer),
Rf_asInteger(end_layer),
Rf_asInteger(step),
&handle_out));
XGBAltrepSetPointer(out, handle_out);
R_API_END();
Rf_unprotect(1);
return out;
}
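/* Note: the R wrapper xgb.slice.Booster passes begin_layer already converted
 * to base-0 ('start - 1') and end_layer as-is, since a base-1 inclusive end
 * equals a base-0 exclusive end. */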

View File

@@ -23,6 +23,22 @@
*/
XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief set the dimensions of an array in-place
* \param arr the array whose dimensions will be set in-place
* \param dims dimensions to set to the array
* \return NULL value
*/
XGB_DLL SEXP XGSetArrayDimInplace_R(SEXP arr, SEXP dims);
/*!
* \brief set the names of the dimensions of an array in-place
* \param arr the array whose dimension names will be set in-place
* \param dim_names names for the dimensions to set
* \return NULL value
*/
XGB_DLL SEXP XGSetArrayDimNamesInplace_R(SEXP arr, SEXP dim_names);
/*!
* \brief Set global configuration
* \param json_str a JSON string representing the list of key-value pairs
@@ -386,4 +402,14 @@ XGB_DLL SEXP XGBoosterGetAttrNames_R(SEXP handle);
*/
XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config);
/*!
* \brief Slice a fitted booster model (by rounds)
* \param handle handle to the fitted booster
* \param begin_layer start of the slice
* \param end_layer end of the slice; end_layer=0 is equivalent to end_layer=num_boost_round
* \param step step size of the slice
* \return The sliced booster with the requested rounds only
*/
XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step);
#endif // XGBOOST_WRAPPER_R_H_ // NOLINT(*)

View File

@@ -33,15 +33,11 @@ test_that("train and predict binary classification", {
pred <- predict(bst, test$data)
expect_length(pred, 1611)
pred1 <- predict(bst, train$data, ntreelimit = 1)
pred1 <- predict(bst, train$data, iterationrange = c(1, 1))
expect_length(pred1, 6513)
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
err_log <- attributes(bst)$evaluation_log[1, train_error]
expect_lt(abs(err_pred1 - err_log), 10e-6)
pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
expect_length(pred1, 6513)
expect_equal(pred1, pred2)
})
test_that("parameter validation works", {
@@ -117,8 +113,8 @@ test_that("dart prediction works", {
nrounds = nrounds,
objective = "reg:squarederror"
)
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, iterationrange = NULL)
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, iterationrange = c(1, nrounds))
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
@@ -139,8 +135,8 @@ test_that("dart prediction works", {
data = dtrain,
nrounds = nrounds
)
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, iterationrange = NULL)
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds))
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE)))
@@ -162,7 +158,7 @@ test_that("train and predict softprob", {
)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
expect_equal(xgb.get.num.boosted.rounds(bst), 5)
pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris) * 3)
# row sums add up to total probability of 1:
@@ -174,12 +170,12 @@ test_that("train and predict softprob", {
err <- sum(pred_labels != lb) / length(lb)
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
# manually calculate error at the 1st iteration:
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1))
pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6)
mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 1))
expect_equal(mpred, mpred1)
d <- cbind(
@@ -213,7 +209,7 @@ test_that("train and predict softmax", {
)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
expect_equal(xgb.get.num.boosted.rounds(bst), 5)
pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris))
@@ -233,19 +229,15 @@ test_that("train and predict RF", {
watchlist = list(train = xgb.DMatrix(train$data, label = lb))
)
expect_equal(xgb.get.num.boosted.rounds(bst), 1)
expect_equal(xgb.ntree(bst), 20)
pred <- predict(bst, train$data)
pred_err <- sum((pred > 0.5) != lb) / length(lb)
expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6)
# expect_lt(pred_err, 0.03)
pred <- predict(bst, train$data, ntreelimit = 20)
pred <- predict(bst, train$data, iterationrange = c(1, 1))
pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
expect_equal(pred_err_20, pred_err)
pred1 <- predict(bst, train$data, iterationrange = c(1, 2))
expect_equal(pred, pred1)
})
test_that("train and predict RF with softprob", {
@@ -261,7 +253,6 @@ test_that("train and predict RF with softprob", {
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
)
expect_equal(xgb.get.num.boosted.rounds(bst), 15)
expect_equal(xgb.ntree(bst), 15 * 3 * 4)
# predict for all iterations:
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
expect_equal(dim(pred), c(nrow(iris), 3))
@@ -269,7 +260,7 @@ test_that("train and predict RF with softprob", {
err <- sum(pred_labels != lb) / length(lb)
expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
# predict for 7 iterations and adjust for 4 parallel trees per iteration
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 7))
err <- sum((max.col(pred) - 1) != lb) / length(lb)
expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})

View File

@@ -0,0 +1,67 @@
context("testing xgb.Booster slicing")
data(agaricus.train, package = "xgboost")
dm <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
# Note: large step sizes are needed here in order for the predictions
# to have substantially different leaf assignments on each tree
model <- xgb.train(
params = list(objective = "binary:logistic", nthread = 1, max_depth = 4, eta = 0.5),
data = dm,
nrounds = 20
)
pred <- predict(model, dm, predleaf = TRUE, reshape = TRUE)
test_that("Slicing full model", {
new_model <- xgb.slice.Booster(model, 1, 0)
expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
new_model <- model[]
expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
new_model <- model[1:length(model)] # nolint
expect_equal(xgb.save.raw(new_model), xgb.save.raw(model))
})
test_that("Slicing sequence from start", {
new_model <- xgb.slice.Booster(model, 1, 10)
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(1, 10)])
new_model <- model[1:10]
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(1, 10)])
})
test_that("Slicing sequence from middle", {
new_model <- xgb.slice.Booster(model, 5, 10)
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(5, 10)])
new_model <- model[5:10]
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(5, 10)])
})
test_that("Slicing with non-unit step", {
for (s in 2:5) {
new_model <- xgb.slice.Booster(model, 1, 17, s)
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(1, 17, s)])
new_model <- model[seq(1, 17, s)]
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(1, 17, s)])
}
})
test_that("Slicing with non-unit step from middle", {
for (s in 2:5) {
new_model <- xgb.slice.Booster(model, 4, 17, s)
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(4, 17, s)])
new_model <- model[seq(4, 17, s)]
new_pred <- predict(new_model, dm, predleaf = TRUE, reshape = TRUE)
expect_equal(new_pred, pred[, seq(4, 17, s)])
}
})

View File

@@ -211,12 +211,11 @@ test_that("early stopping xgb.train works", {
, "Stopping. Best iteration")
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
pred <- predict(bst, dtest)
expect_equal(length(pred), 1611)
err_pred <- err(ltest, pred)
err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error]
err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration") + 1, test_error]
expect_equal(err_log, err_pred, tolerance = 5e-6)
set.seed(11)
@@ -231,8 +230,7 @@ test_that("early stopping xgb.train works", {
loaded <- xgb.load(fname)
expect_false(is.null(xgb.attr(loaded, "best_iteration")))
expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit"))
expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_iteration"))
})
test_that("early stopping using a specific metric works", {
@@ -245,12 +243,11 @@ test_that("early stopping using a specific metric works", {
, "Stopping. Best iteration")
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit"))
pred <- predict(bst, dtest, iterationrange = c(1, xgb.attr(bst, "best_iteration") + 1))
expect_equal(length(pred), 1611)
logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest)
logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss]
logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration") + 1, test_logloss]
expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
})
@@ -286,7 +283,6 @@ test_that("early stopping xgb.cv works", {
, "Stopping. Best iteration")
expect_false(is.null(cv$best_iteration))
expect_lt(cv$best_iteration, 19)
expect_equal(cv$best_iteration, cv$best_ntreelimit)
# the best error is min error:
expect_true(cv$evaluation_log[, test_error_mean[cv$best_iteration] == min(test_error_mean)])
})
@@ -354,3 +350,44 @@ test_that("prediction in xgb.cv for softprob works", {
expect_equal(dim(cv$pred), c(nrow(iris), 3))
expect_lt(diff(range(rowSums(cv$pred))), 1e-6)
})
test_that("prediction in xgb.cv works for multi-quantile", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
cv <- xgb.cv(
data = dm,
params = list(
objective = "reg:quantileerror",
quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9),
nthread = 1
),
nrounds = 5,
nfold = 3,
prediction = TRUE,
verbose = 0
)
expect_equal(dim(cv$pred), c(nrow(x), 5))
})
test_that("prediction in xgb.cv works for multi-output", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1)
cv <- xgb.cv(
data = dm,
params = list(
tree_method = "hist",
multi_strategy = "multi_output_tree",
objective = "reg:squarederror",
nthread = n_threads
),
nrounds = 5,
nfold = 3,
prediction = TRUE,
verbose = 0
)
expect_equal(dim(cv$pred), c(nrow(x), 2))
})

View File

@@ -72,10 +72,10 @@ test_that("gblinear early stopping works", {
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)
expect_equal(xgb.attr(booster, "best_iteration"), 5)
expect_equal(xgb.attr(booster, "best_iteration"), 4)
predt_es <- predict(booster, dtrain)
n <- xgb.attr(booster, "best_iteration") + es_round
n <- xgb.attr(booster, "best_iteration") + es_round + 1
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)

View File

@@ -282,9 +282,6 @@ test_that("xgb.model.dt.tree works with and without feature names", {
expect_equal(dim(dt.tree), c(188, 10))
expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
expect_equal(dt.tree, dt.tree.0)
# when model contains no feature names:
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
@@ -304,7 +301,7 @@ test_that("xgb.model.dt.tree throws error for gblinear", {
test_that("xgb.importance works with and without feature names", {
.skip_if_vcd_not_available()
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree.unnamed)
if (!flag_32bit)
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
@@ -330,9 +327,8 @@ test_that("xgb.importance works with and without feature names", {
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
importance_from_dump <- function() {
model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees)
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
imp <- xgb.model.dt.tree(
feature_names = feature.names,
text = model_text_dump,
trees = trees
)[

View File

@@ -44,7 +44,7 @@ test_that('Test ranking with weighted data', {
expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
for (i in 1:10) {
pred <- predict(bst, newdata = dtrain, ntreelimit = i)
pred <- predict(bst, newdata = dtrain, iterationrange = c(1, i))
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted <- lapply(seq(1, 20, by = 5),
function(k) {