merge latest, Jan 12 2024

commit 1e1e8be3a5

.github/workflows/r_tests.yml (vendored)
@@ -25,7 +25,7 @@ jobs:
with:
submodules: 'true'

- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
- uses: r-lib/actions/setup-r@e40ad904310fc92e96951c1b0d64f3de6cbe9e14 # v2.6.5
with:
r-version: ${{ matrix.config.r }}

@@ -54,7 +54,7 @@ jobs:
matrix:
config:
- {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
- {os: windows-latest, r: '4.2.0', compiler: 'msvc', build: 'cmake'}
- {os: windows-latest, r: '4.3.0', compiler: 'msvc', build: 'cmake'}
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
@@ -64,7 +64,7 @@ jobs:
with:
submodules: 'true'

- uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4
- uses: r-lib/actions/setup-r@e40ad904310fc92e96951c1b0d64f3de6cbe9e14 # v2.6.5
with:
r-version: ${{ matrix.config.r }}

@@ -14,6 +14,15 @@ if(ENABLE_ALL_WARNINGS)
target_compile_options(xgboost-r PRIVATE -Wall -Wextra)
endif()

if(MSVC)
# https://github.com/microsoft/LightGBM/pull/6061
# MSVC doesn't work with anonymous types in structs. (R complex)
#
# syntax error: missing ';' before identifier 'private_data_c'
#
target_compile_definitions(xgboost-r PRIVATE -DR_LEGACY_RCOMPLEX)
endif()

target_compile_definitions(
xgboost-r PUBLIC
-DXGBOOST_STRICT_R_MODE=1

@@ -58,12 +58,13 @@ Suggests:
float,
titanic
Depends:
R (>= 3.3.0)
R (>= 4.3.0)
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9.6),
jsonlite (>= 1.0),
jsonlite (>= 1.0)
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Encoding: UTF-8
SystemRequirements: GNU make, C++17

@@ -2,16 +2,19 @@

S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(coef,xgb.Booster)
S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.Booster)
S3method(getinfo,xgb.DMatrix)
S3method(predict,xgb.Booster)
S3method(predict,xgb.Booster.handle)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.Booster)
S3method(setinfo,xgb.DMatrix)
S3method(slice,xgb.DMatrix)
S3method(variable.names,xgb.Booster)
export("xgb.attr<-")
export("xgb.attributes<-")
export("xgb.config<-")
@@ -26,21 +29,27 @@ export(cb.save.model)
export(getinfo)
export(setinfo)
export(slice)
export(xgb.Booster.complete)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
export(xgb.attr)
export(xgb.attributes)
export(xgb.config)
export(xgb.copy.Booster)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
export(xgb.gblinear.history)
export(xgb.get.DMatrix.data)
export(xgb.get.DMatrix.num.non.missing)
export(xgb.get.DMatrix.qcut)
export(xgb.get.config)
export(xgb.get.num.boosted.rounds)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
export(xgb.ggplot.shap.summary)
export(xgb.importance)
export(xgb.is.same.Booster)
export(xgb.load)
export(xgb.load.raw)
export(xgb.model.dt.tree)
@@ -52,13 +61,12 @@ export(xgb.plot.shap.summary)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.serialize)
export(xgb.set.config)
export(xgb.train)
export(xgb.unserialize)
export(xgboost)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgRMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Matrix,colSums)
importFrom(Matrix,sparse.model.matrix)
@@ -82,8 +90,11 @@ importFrom(graphics,points)
importFrom(graphics,title)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(methods,new)
importFrom(stats,coef)
importFrom(stats,median)
importFrom(stats,predict)
importFrom(stats,variable.names)
importFrom(utils,head)
importFrom(utils,object.size)
importFrom(utils,str)

@@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) {
})

if (!is.null(env$bst)) {
xgb.parameters(env$bst$handle) <- pars
xgb.parameters(env$bst) <- pars
} else {
for (fd in env$bst_folds)
xgb.parameters(fd$bst) <- pars
@@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if (!is.null(env$bst)) {
if (!inherits(env$bst, 'xgb.Booster'))
stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
best_score <<- as.numeric(best_score)
best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg'))
best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
} else {
xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
}
} else if (is.null(env$bst_folds) || is.null(env$basket)) {
stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
@@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,

finalizer <- function(env) {
if (!is.null(env$bst)) {
attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
if (best_score != attr_best_score) {
# If the difference is too big, throw an error
if (abs(best_score - attr_best_score) >= 1e-14) {
@@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
# If the difference is due to floating-point truncation, update best_score
best_score <- attr_best_score
}
env$bst$best_iteration <- best_iteration
env$bst$best_ntreelimit <- best_ntreelimit
env$bst$best_score <- best_score
xgb.attr(env$bst, "best_iteration") <- best_iteration
xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
xgb.attr(env$bst, "best_score") <- best_score
} else {
env$basket$best_iteration <- best_iteration
env$basket$best_ntreelimit <- best_ntreelimit
@@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' @param save_period save the model to disk after every
#' \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#'
#' Note that the format of the model being saved is determined by the file
#' extension specified here (see \link{xgb.save} for details about how it works).
#'
#' It can contain a \code{\link[base]{sprintf}} formatting specifier
#' to include the integer iteration number in the file name.
#' E.g., with \code{save_name} = 'xgboost_%04d.model',
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#'
#' E.g., with \code{save_name} = 'xgboost_%04d.ubj',
#' the file saved at iteration 50 would be named "xgboost_0050.ubj".
#' @seealso \link{xgb.save}
#' @details
#' This callback function allows saving an xgb-model file, either periodically after every \code{save_period} iterations or at the end.
#'
@@ -430,7 +434,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' \code{\link{callbacks}}
#'
#' @export
cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {

if (save_period < 0)
stop("'save_period' cannot be negative")
@@ -440,8 +444,13 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
stop("'save_model' callback requires the 'bst' booster object in its calling frame")

if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
(save_period == 0 && env$iteration == env$end_iteration))
xgb.save(env$bst, sprintf(save_name, env$iteration))
(save_period == 0 && env$iteration == env$end_iteration)) {
# Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
suppressWarnings({
save_name <- sprintf(save_name, env$iteration)
})
xgb.save(env$bst, save_name)
}
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.save.model'
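The iteration-number formatting described above can be checked directly with base R's sprintf(); a minimal sketch (file names are illustrative only):

# '%04d' is replaced by the zero-padded iteration number.
save_name <- "xgboost_%04d.ubj"
sprintf(save_name, 50)
#> [1] "xgboost_0050.ubj"
# A name without a format specifier passes through unchanged, but with a
# warning about the unused argument -- which is why the callback wraps the
# call in suppressWarnings().
suppressWarnings(sprintf("xgboost.ubj", 50))
#> [1] "xgboost.ubj"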
@@ -512,8 +521,7 @@ cb.cv.predict <- function(save_models = FALSE) {
env$basket$pred <- pred
if (save_models) {
env$basket$models <- lapply(env$bst_folds, function(fd) {
xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE)
return(fd$bst)
})
}
}
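A hedged usage sketch for the callback above: since the basket is merged into the returned object, per-fold boosters should appear under the result's 'models' element when save_models = TRUE ('dtrain' here is assumed to be an existing xgb.DMatrix with labels):

library(xgboost)
cv <- xgb.cv(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain, nrounds = 5, nfold = 3,
  callbacks = list(cb.cv.predict(save_models = TRUE))
)
length(cv$models)  # one fitted booster per fold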
@@ -665,7 +673,7 @@ cb.gblinear.history <- function(sparse = FALSE) {
} else { # xgb.cv:
cf <- vector("list", length(env$bst_folds))
for (i in seq_along(env$bst_folds)) {
dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL))
dmp <- xgb.dump(env$bst_folds[[i]]$bst)
cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
}
@@ -685,14 +693,19 @@ cb.gblinear.history <- function(sparse = FALSE) {
callback
}

#' Extract gblinear coefficients history.
#'
#' A helper function to extract the matrix of linear coefficients' history
#' @title Extract gblinear coefficients history.
#' @description A helper function to extract the matrix of linear coefficients' history
#' from a gblinear model created while using the \code{cb.gblinear.history()}
#' callback.
#' @details Note that this is an R-specific function that relies on R attributes that
#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
#' or \link{xgb.load.raw}.
#'
#' In order for a serialized model to be accepted by this function, one must use R
#' serializers such as \link{saveRDS}.
#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
#' using the \code{cb.gblinear.history()} callback.
#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
#' loaded from \link{xgb.load} or \link{xgb.load.raw}.
#' @param class_index zero-based class index to extract the coefficients for only that
#' specific class in a multinomial multiclass model. When it is NULL, all the
#' coefficients are returned. Has no effect in non-multiclass models.
@@ -713,20 +726,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
is_cv <- inherits(model, "xgb.cv.synchronous")

if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
if (is_cv) {
callbacks <- model$callbacks
} else {
callbacks <- attributes(model)$callbacks
}

if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
stop("model must be trained while using the cb.gblinear.history() callback")

if (!is_cv) {
# extract num_class & num_feat from the internal model
dmp <- xgb.dump(model)
if (length(dmp) < 2 || dmp[2] != "bias:")
stop("It does not appear to be a gblinear model")
dmp <- dmp[-c(1, 2)]
n <- which(dmp == 'weight:')
if (length(n) != 1)
stop("It does not appear to be a gblinear model")
num_class <- n - 1
num_feat <- (length(dmp) - 4) / num_class
num_class <- xgb.num_class(model)
num_feat <- xgb.num_feature(model)
} else {
# in case of CV, the object is expected to have this info
if (model$params$booster != "gblinear")
@@ -742,7 +753,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
(class_index[1] < 0 || class_index[1] >= num_class))
stop("class_index has to be within [0,", num_class - 1, "]")

coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
if (!is.null(class_index) && num_class > 1) {
coef_path <- if (is.list(coef_path)) {
lapply(coef_path,
@@ -770,7 +781,8 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
if (!is.null(eval_err)) {
if (length(eval_res) != length(eval_err))
stop('eval_res & eval_err lengths mismatch')
res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse = '\t')
# Note: UTF-8 code for plus/minus sign is U+00B1
res <- paste0(sprintf("%s:%f\U00B1%f", enames, eval_res, eval_err), collapse = '\t')
} else {
res <- paste0(sprintf("%s:%f", enames, eval_res), collapse = '\t')
}

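A minimal sketch of training with the gblinear history callback and extracting the coefficient path, following the documentation above (data and parameter values are illustrative):

library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
bst <- xgb.train(
  params = list(booster = "gblinear", objective = "binary:logistic", nthread = 1),
  data = dtrain, nrounds = 10,
  callbacks = list(cb.gblinear.history())
)
# One row per boosting round; columns hold the linear coefficients.
coef_path <- xgb.gblinear.history(bst)
dim(coef_path)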
@@ -93,6 +93,14 @@ check.booster.params <- function(params, ...) {
interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']'))
params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']')
}

# for evaluation metrics, should generate multiple entries per metric
if (NROW(params[['eval_metric']]) > 1) {
eval_metrics <- as.list(params[["eval_metric"]])
names(eval_metrics) <- rep("eval_metric", length(eval_metrics))
params_without_ev_metrics <- within(params, rm("eval_metric"))
params <- c(params_without_ev_metrics, eval_metrics)
}
return(params)
}

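The eval_metric expansion above can be traced with plain list operations; a sketch using the same steps as the diff:

params <- list(objective = "binary:logistic", eval_metric = c("auc", "logloss"))
eval_metrics <- as.list(params[["eval_metric"]])
names(eval_metrics) <- rep("eval_metric", length(eval_metrics))
params <- c(within(params, rm("eval_metric")), eval_metrics)
str(params)  # two separate 'eval_metric' entries, one per metric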
@@ -140,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) {


# Update a booster handle for an iteration with dtrain data
xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
if (!identical(class(booster_handle), "xgb.Booster.handle")) {
stop("booster_handle must be of xgb.Booster.handle class")
}
xgb.iter.update <- function(bst, dtrain, iter, obj) {
if (!inherits(dtrain, "xgb.DMatrix")) {
stop("dtrain must be of xgb.DMatrix class")
}
handle <- xgb.get.handle(bst)

if (is.null(obj)) {
.Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
.Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain)
} else {
pred <- predict(
booster_handle,
bst,
dtrain,
outputmargin = TRUE,
training = TRUE,
@@ -160,23 +166,24 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
)
gpair <- obj(pred, dtrain)
n_samples <- dim(dtrain)[1]
grad <- gpair$grad
hess <- gpair$hess

msg <- paste(
"Since 2.1.0, the shape of the gradient and hessian is required to be ",
"(n_samples, n_targets) or (n_samples, n_classes).",
sep = ""
)
if (is.matrix(gpair$grad) && dim(gpair$grad)[1] != n_samples) {
warning(msg)
}
if (is.numeric(gpair$grad) && length(gpair$grad) != n_samples) {
warning(msg)
if ((is.matrix(grad) && dim(grad)[1] != n_samples) ||
(is.vector(grad) && length(grad) != n_samples) ||
(is.vector(grad) != is.vector(hess))) {
warning(paste(
"Since 2.1.0, the shape of the gradient and hessian is required to be ",
"(n_samples, n_targets) or (n_samples, n_classes). Will reshape assuming ",
"column-major order.",
sep = ""
))
grad <- matrix(grad, nrow = n_samples)
hess <- matrix(hess, nrow = n_samples)
}

gpair$grad <- matrix(gpair$grad, nrow = n_samples)
gpair$hess <- matrix(gpair$hess, nrow = n_samples)
.Call(
XGBoosterBoostOneIter_R, booster_handle, dtrain, iter, gpair$grad, gpair$hess
XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess
)
}
return(TRUE)
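The column-major reshape applied above can be illustrated with base R's matrix(); a small sketch:

n_samples <- 4
grad <- c(0.1, -0.2, 0.3, -0.4)        # vector form: one target per sample
grad <- matrix(grad, nrow = n_samples) # becomes shape (n_samples, 1)
dim(grad)
#> [1] 4 1
# For k targets/classes, a length n_samples * k vector fills the matrix
# column by column (column-major order), as the warning text says.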
@@ -186,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
# Evaluate one iteration.
# Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) {
if (!identical(class(booster_handle), "xgb.Booster.handle"))
stop("class of booster_handle must be xgb.Booster.handle")
xgb.iter.eval <- function(bst, watchlist, iter, feval) {
handle <- xgb.get.handle(bst)

if (length(watchlist) == 0)
return(NULL)

evnames <- names(watchlist)
if (is.null(feval)) {
msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
} else {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
## predict using all trees
preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)
@@ -343,16 +349,45 @@ xgb.createFolds <- function(y, k) {
#' @name xgboost-deprecated
NULL

#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
#' @title Model Serialization and Compatibility
#' @description
#'
#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
#' the model is to be accessed in the future. If you train a model with the current version of
#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
#' its own serializers with better compatibility guarantees, which allow loading
#' said models in other language bindings of XGBoost.
#'
#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{
#' \item Additional model configuration (accessible through \link{xgb.config}),
#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`.
#' These are not necessarily useful for prediction/importance/plotting.
#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log`
#' if present.
#' }
#'
#' The first one (configurations) does not have the same compatibility guarantees as
#' the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration
#' might be lost after loading the booster in a different XGBoost version, regardless of the
#' serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
#' if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
#' serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
#'
#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
#' not saved when using XGBoost's own serializers. These attributes are only used for informational
#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
#' These R attributes are only preserved when using R's serializers.
#'
#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and XGBoost models
#' before version `2.1.0`, have a very different R object structure and are incompatible with
#' each other. Hence, models that were saved with R serializers like `saveRDS` or `save` before
#' version `2.1.0` will not work with later `xgboost` versions and vice versa. Be aware that
#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers
#' should be preferred for long-term storage.
#'
#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
#' higher of said package, and will have the same compatibility restrictions as R serializers.
#'
#' @details
#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@@ -365,26 +400,29 @@ NULL
#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
#' as part of another R object.
#'
#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
#' model but also internal configurations and parameters, and its format is not stable across
#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
#' control as it relies on R's serialization format (see e.g. the details section in
#' \link{serialize} and \link{save} from base R).
#'
#' For more details and explanation about model persistence and archival, consult the page
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#' max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
#' objective = "binary:logistic")
#'
#' # Save as a stand-alone file; load it with xgb.load()
#' xgb.save(bst, 'xgb.model')
#' bst2 <- xgb.load('xgb.model')
#' fname <- file.path(tempdir(), "xgb_model.ubj")
#' xgb.save(bst, fname)
#' bst2 <- xgb.load(fname)
#'
#' # Save as a stand-alone file (JSON); load it with xgb.load()
#' xgb.save(bst, 'xgb.model.json')
#' bst2 <- xgb.load('xgb.model.json')
#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
#' fname <- file.path(tempdir(), "xgb_model.json")
#' xgb.save(bst, fname)
#' bst2 <- xgb.load(fname)
#'
#' # Save as a raw byte vector; load it with xgb.load.raw()
#' xgb_bytes <- xgb.save.raw(bst)
@@ -395,12 +433,12 @@ NULL
#' # Persist the R object. Here, saveRDS() is okay, since it doesn't persist
#' # xgb.Booster directly. What's being persisted is the future-proof byte representation
#' # as given by xgb.save.raw().
#' saveRDS(obj, 'my_object.rds')
#' fname <- file.path(tempdir(), "my_object.Rds")
#' saveRDS(obj, fname)
#' # Read back the R object
#' obj2 <- readRDS('my_object.rds')
#' obj2 <- readRDS(fname)
#' # Re-construct xgb.Booster object from the bytes
#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
#'
#' @name a-compatibility-note-for-saveRDS-save
NULL

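A hedged sketch of the attribute behavior described above (assuming 'bst' was trained with a watchlist, so an evaluation log exists):

# R-specific attributes survive R serialization...
fname <- file.path(tempdir(), "bst.rds")
saveRDS(bst, fname)
attributes(readRDS(fname))$evaluation_log  # still present

# ...but are not part of XGBoost's own format, per the text above:
bst2 <- xgb.load.raw(xgb.save.raw(bst))
attributes(bst2)$evaluation_log            # expected NULL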
File diff suppressed because it is too large
@@ -8,13 +8,45 @@
#' a \code{dgRMatrix} object,
#' a \code{dsparseVector} object (only when making predictions from a fitted model, will be
#' interpreted as a row vector), or a character string representing a filename.
#' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.
#' See \code{\link{setinfo}} for the specific allowed kinds of information.
#' @param label Label of the training data.
#' @param weight Weight for each instance.
#'
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
#' is assigned to each group (not each data point). This is because we
#' only care about the relative ordering of data points within each group,
#' so it doesn't make sense to assign weights to individual data points.
#' @param base_margin Base margin used for boosting from existing model.
#'
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
#' @param missing a float value to represent missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features. Overrides column names in data
#' frame and matrix.
#' @param nthread Number of threads used for creating DMatrix.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param enable_categorical Experimental support of specializing for categorical features.
#'
#' If passing 'TRUE' and 'data' is a data frame,
#' columns of categorical types will automatically
#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
#'
#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
#' it will result in an error being thrown.
#'
#' If 'data' is not a data frame, this argument is ignored.
#'
#' JSON/UBJSON serialization format is required for this.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -24,21 +56,43 @@
#' dtrain <- with(
#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
#' )
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
#' xgb.DMatrix.save(dtrain, fname)
#' dtrain <- xgb.DMatrix(fname)
#' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthread = NULL, ...) {
cnames <- NULL
xgb.DMatrix <- function(
data,
label = NULL,
weight = NULL,
base_margin = NULL,
missing = NA,
silent = FALSE,
feature_names = colnames(data),
nthread = NULL,
group = NULL,
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
enable_categorical = FALSE
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
ctypes <- NULL
if (typeof(data) == "character") {
if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.")
if (length(data) > 1) {
stop(
"'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename."
)
}
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
cnames <- colnames(data)
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
)
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(
XGDMatrixCreateFromCSC_R,
@@ -49,7 +103,6 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre
missing,
as.integer(NVL(nthread, -1))
)
cnames <- colnames(data)
} else if (inherits(data, "dgRMatrix")) {
handle <- .Call(
XGDMatrixCreateFromCSR_R,
@@ -60,7 +113,6 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre
missing,
as.integer(NVL(nthread, -1))
)
cnames <- colnames(data)
} else if (inherits(data, "dsparseVector")) {
indptr <- c(0L, as.integer(length(data@i)))
ind <- as.integer(data@i) - 1L
@@ -73,23 +125,112 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre
missing,
as.integer(NVL(nthread, -1))
)
} else if (is.data.frame(data)) {
ctypes <- sapply(data, function(x) {
if (is.factor(x)) {
if (!enable_categorical) {
stop(
"When factor type is used, the parameter `enable_categorical`",
" must be set to TRUE."
)
}
"c"
} else if (is.integer(x)) {
"int"
} else if (is.logical(x)) {
"i"
} else {
if (!is.numeric(x)) {
stop("Invalid type in dataframe.")
}
"float"
}
})
## as.data.frame somehow converts integer/logical into real.
data <- as.data.frame(sapply(data, function(x) {
if (is.factor(x)) {
## XGBoost uses 0-based indexing.
as.numeric(x) - 1
} else {
x
}
}))
handle <- .Call(
XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
)
} else {
stop("xgb.DMatrix does not support construction from ", typeof(data))
}

dmat <- handle
attributes(dmat) <- list(class = "xgb.DMatrix")
if (!is.null(cnames)) {
setinfo(dmat, "feature_name", cnames)
attributes(dmat) <- list(
class = "xgb.DMatrix",
fields = new.env()
)

if (!is.null(label)) {
setinfo(dmat, "label", label)
}
if (!is.null(weight)) {
setinfo(dmat, "weight", weight)
}
if (!is.null(base_margin)) {
setinfo(dmat, "base_margin", base_margin)
}
if (!is.null(feature_names)) {
setinfo(dmat, "feature_name", feature_names)
}
if (!is.null(group)) {
setinfo(dmat, "group", group)
}
if (!is.null(qid)) {
setinfo(dmat, "qid", qid)
}
if (!is.null(label_lower_bound)) {
setinfo(dmat, "label_lower_bound", label_lower_bound)
}
if (!is.null(label_upper_bound)) {
setinfo(dmat, "label_upper_bound", label_upper_bound)
}
if (!is.null(feature_weights)) {
setinfo(dmat, "feature_weights", feature_weights)
}
if (!is.null(ctypes)) {
setinfo(dmat, "feature_type", ctypes)
}

info <- append(info, list(...))
for (i in seq_along(info)) {
p <- info[i]
setinfo(dmat, names(p), p[[1]])
}
return(dmat)
}

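A minimal sketch of the rewritten constructor above, passing fields directly as named arguments instead of an 'info' list (toy data):

x <- matrix(rnorm(20), nrow = 5)
dm <- xgb.DMatrix(
  data = x,
  label = c(0, 1, 0, 1, 0),
  weight = rep(1, 5),
  nthread = 1
)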
#' @title Check whether DMatrix object has a field
#' @description Checks whether an xgb.DMatrix object has a given field assigned to
#' it, such as weights, labels, etc.
#' @param object The DMatrix object to check for the given \code{info} field.
#' @param info The field to check for presence or absence in \code{object}.
#' @seealso \link{xgb.DMatrix}, \link{getinfo.xgb.DMatrix}, \link{setinfo.xgb.DMatrix}
#' @examples
#' library(xgboost)
#' x <- matrix(1:10, nrow = 5)
#' dm <- xgb.DMatrix(x, nthread = 1)
#'
#' # 'dm' so far doesn't have any fields set
#' xgb.DMatrix.hasinfo(dm, "label")
#'
#' # Fields can be added after construction
#' setinfo(dm, "label", 1:5)
#' xgb.DMatrix.hasinfo(dm, "label")
#' @export
xgb.DMatrix.hasinfo <- function(object, info) {
if (!inherits(object, "xgb.DMatrix")) {
stop("Object is not an 'xgb.DMatrix'.")
}
if (.Call(XGCheckNullPtr_R, object)) {
warning("xgb.DMatrix object is invalid. Must be constructed again.")
return(FALSE)
}
return(NVL(attr(object, "fields")[[info]], FALSE))
}


# get dmatrix from data, label
# internal helper method
@@ -194,26 +335,38 @@ dimnames.xgb.DMatrix <- function(x) {
}


#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#' @param object Object of class \code{xgb.DMatrix}
#' @title Get or set information of xgb.DMatrix and xgb.Booster objects
#' @param object Object of class \code{xgb.DMatrix} or `xgb.Booster`.
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#'
#' @return For `getinfo`, will return the requested field. For `setinfo`, will always return value `TRUE`
#' if it succeeds.
#' @details
#' The \code{name} field can be one of the following:
#' The \code{name} field can be one of the following for `xgb.DMatrix`:
#'
#' \itemize{
#' \item \code{label}: label XGBoost learns from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
#' \item \code{label}
#' \item \code{weight}
#' \item \code{base_margin}
#' \item \code{label_lower_bound}
#' \item \code{label_upper_bound}
#' \item \code{group}
#' \item \code{feature_type}
#' \item \code{feature_name}
#' \item \code{nrow}
#' }
#' See the documentation for \link{xgb.DMatrix} for more information about these fields.
#'
#' For `xgb.Booster`, can be one of the following:
#' \itemize{
#' \item \code{feature_type}
#' \item \code{feature_name}
#' }
#'
#' \code{group} can be set by \code{setinfo} but can't be retrieved by \code{getinfo}.
#' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
#' for a DMatrix that had 'qid' assigned.
#'
#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See
#' \link{xgb.copy.Booster} for an idea of how this in-place assignment works.
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
@@ -225,49 +378,60 @@ dimnames.xgb.DMatrix <- function(x) {
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, ...) UseMethod("getinfo")
getinfo <- function(object, name) UseMethod("getinfo")

#' @rdname getinfo
#' @export
getinfo.xgb.DMatrix <- function(object, name, ...) {
getinfo.xgb.DMatrix <- function(object, name) {
allowed_int_fields <- 'group'
allowed_float_fields <- c(
'label', 'weight', 'base_margin',
'label_lower_bound', 'label_upper_bound'
)
allowed_str_fields <- c("feature_type", "feature_name")
allowed_fields <- c(allowed_float_fields, allowed_int_fields, allowed_str_fields, 'nrow')

if (typeof(name) != "character" ||
length(name) != 1 ||
!name %in% c('label', 'weight', 'base_margin', 'nrow',
'label_lower_bound', 'label_upper_bound', "feature_type", "feature_name")) {
stop(
"getinfo: name must be one of the following\n",
" 'label', 'weight', 'base_margin', 'nrow', 'label_lower_bound', 'label_upper_bound', 'feature_type', 'feature_name'"
)
!name %in% allowed_fields) {
stop("getinfo: name must be one of the following\n",
paste(paste0("'", allowed_fields, "'"), collapse = ", "))
}
if (name == "feature_name" || name == "feature_type") {
ret <- .Call(XGDMatrixGetStrFeatureInfo_R, object, name)
} else if (name != "nrow") {
ret <- .Call(XGDMatrixGetInfo_R, object, name)
} else {
if (name == "nrow") {
ret <- nrow(object)
} else if (name %in% allowed_str_fields) {
ret <- .Call(XGDMatrixGetStrFeatureInfo_R, object, name)
} else if (name %in% allowed_float_fields) {
ret <- .Call(XGDMatrixGetFloatInfo_R, object, name)
if (length(ret) > nrow(object)) {
ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
}
} else if (name %in% allowed_int_fields) {
if (name == "group") {
name <- "group_ptr"
}
ret <- .Call(XGDMatrixGetUIntInfo_R, object, name)
if (length(ret) > nrow(object)) {
ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
}
}
if (length(ret) == 0) return(NULL)
return(ret)
}


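A hedged sketch of the 'group' handling above: the getter maps "group" to the internal "group_ptr", so what comes back should be cumulative group boundaries rather than the sizes that were set:

dm <- xgb.DMatrix(matrix(rnorm(12), nrow = 6), nthread = 1)
setinfo(dm, "group", c(2, 4))  # two query groups of sizes 2 and 4
getinfo(dm, "group")           # expected boundaries, e.g. 0 2 6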
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @rdname getinfo
#' @param info the specific field of information to set
#' @param ... other parameters
#'
#' @details
#' The \code{name} field can be one of the following:
#' See the documentation for \link{xgb.DMatrix} for possible fields that can be set
#' (which correspond to arguments in that function).
#'
#' \itemize{
#' \item \code{label}: label XGBoost learns from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
#' \item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective).
#' Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
#' but \bold{aren't} allowed here:\itemize{
#' \item data
#' \item missing
#' \item silent
#' \item nthread
#' }
#'
#' @examples
@@ -278,52 +442,61 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, ...) UseMethod("setinfo")
setinfo <- function(object, name, info) UseMethod("setinfo")

#' @rdname setinfo
#' @rdname getinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info, ...) {
setinfo.xgb.DMatrix <- function(object, name, info) {
.internal.setinfo.xgb.DMatrix(object, name, info)
attr(object, "fields")[[name]] <- TRUE
return(TRUE)
}

.internal.setinfo.xgb.DMatrix <- function(object, name, info) {
if (name == "label") {
if (length(info) != nrow(object))
if (NROW(info) != nrow(object))
stop("The length of labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "label_lower_bound") {
if (length(info) != nrow(object))
if (NROW(info) != nrow(object))
stop("The length of lower-bound labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "label_upper_bound") {
if (length(info) != nrow(object))
if (NROW(info) != nrow(object))
stop("The length of upper-bound labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "weight") {
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "base_margin") {
# if (length(info)!=nrow(object))
# stop("The length of base margin must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "group") {
if (sum(info) != nrow(object))
stop("The sum of groups must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.integer(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "qid") {
if (NROW(info) != nrow(object))
stop("The length of qid assignments must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}
if (name == "feature_weights") {
if (length(info) != ncol(object)) {
if (NROW(info) != ncol(object)) {
stop("The number of feature weights must equal to the number of columns in the input data")
}
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
.Call(XGDMatrixSetInfo_R, object, name, info)
return(TRUE)
}

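A short sketch of the new 'qid' branch above: per-row query ids are an alternative to 'group' for ranking data (one id per row):

dm <- xgb.DMatrix(matrix(rnorm(12), nrow = 6), nthread = 1)
setinfo(dm, "qid", c(1, 1, 1, 2, 2, 2))  # rows 1-3 form query 1, rows 4-6 query 2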
@@ -353,6 +526,111 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
stop("setinfo: unknown info name ", name)
}

#' @title Get Quantile Cuts from DMatrix
#' @description Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method="hist"`).
#'
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
#' As such, the first and last bin will be outside of the range of the data, so as to include
#' all of the observations there.
#'
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
#' which will be output in sorted order from lowest to highest.
#'
#' Different columns can have different numbers of bins according to their range.
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @param output Output format for the quantile cuts. Possible options are:\itemize{
#' \item `"list"` will return the output as a list with one entry per column, where
#' each column will have a numeric vector with the cuts. The list will be named if
#' `dmat` has column names assigned to it.
#' \item `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#' `indptr[i]+1` to `indptr[i+1]`.
#' }
#' @return The quantile cuts, in the format specified by parameter `output`.
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#'
#' # DMatrix is not quantized right away, but will be once a hist model is generated
#' model <- xgb.train(
#' data = dm,
#' params = list(
#' tree_method = "hist",
#' max_bin = 8,
#' nthread = 1
#' ),
#' nrounds = 3
#' )
#'
#' # Now can get the quantile cuts
#' xgb.get.DMatrix.qcut(dm)
#' @export
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
stopifnot(inherits(dmat, "xgb.DMatrix"))
output <- head(output, 1L)
stopifnot(output %in% c("list", "arrays"))
res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
if (output == "arrays") {
return(res)
} else {
feature_names <- getinfo(dmat, "feature_name")
ncols <- length(res$indptr) - 1
out <- lapply(
seq(1, ncols),
function(col) {
st <- res$indptr[col]
end <- res$indptr[col + 1]
if (end <= st) {
return(numeric())
}
return(res$data[seq(1 + st, end)])
}
)
if (NROW(feature_names)) {
names(out) <- feature_names
}
return(out)
}
}

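A sketch of the 'arrays' output layout documented above, continuing from the quantized 'dm' of the example: with base-0 'indptr', the cuts for column i occupy data[(indptr[i] + 1):indptr[i + 1]]:

res <- xgb.get.DMatrix.qcut(dm, output = "arrays")
i <- 1L
cuts_i <- res$data[seq(res$indptr[i] + 1L, res$indptr[i + 1L])]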
#' @title Get Number of Non-Missing Entries in DMatrix
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The number of non-missing entries in the DMatrix
#' @export
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
stopifnot(inherits(dmat, "xgb.DMatrix"))
return(.Call(XGDMatrixNumNonMissing_R, dmat))
}

#' @title Get DMatrix Data
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
#' from package `Matrix`). If it had feature names, these will be added as column names
#' in the output.
#' @export
xgb.get.DMatrix.data <- function(dmat) {
stopifnot(inherits(dmat, "xgb.DMatrix"))
res <- .Call(XGDMatrixGetDataAsCSR_R, dmat)
out <- methods::new("dgRMatrix")
nrows <- as.integer(length(res$indptr) - 1)
out@p <- res$indptr
out@j <- res$indices
out@x <- res$data
out@Dim <- as.integer(c(nrows, res$ncols))

feature_names <- getinfo(dmat, "feature_name")
dim_names <- list(NULL, NULL)
if (NROW(feature_names)) {
dim_names[[2L]] <- feature_names
}
out@Dimnames <- dim_names
return(out)
}

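Usage sketch for the accessor above: the contents of a DMatrix come back as a Matrix-package CSR object, with feature names as column names when available ('dm' is any existing xgb.DMatrix):

library(Matrix)
csr <- xgb.get.DMatrix.data(dm)
class(csr)     # "dgRMatrix"
colnames(csr)  # feature names, if the DMatrix had them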
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
@@ -363,7 +641,6 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
#' @param object Object of class "xgb.DMatrix"
#' @param idxset an integer vector of indices of rows needed
#' @param colset currently not used (columns subsetting is not available)
#' @param ... other parameters (currently not used)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -377,11 +654,11 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
#'
#' @rdname slice.xgb.DMatrix
#' @export
slice <- function(object, ...) UseMethod("slice")
slice <- function(object, idxset) UseMethod("slice")

#' @rdname slice.xgb.DMatrix
#' @export
slice.xgb.DMatrix <- function(object, idxset, ...) {
slice.xgb.DMatrix <- function(object, idxset) {
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
@@ -431,11 +708,15 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' @method print xgb.DMatrix
#' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
if (.Call(XGCheckNullPtr_R, x)) {
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
return(invisible(x))
}
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0)
if (length(getinfo(x, 'label')) > 0) infos <- 'label'
if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin')
if (length(infos) == 0) infos <- 'NA'
cat(infos)
cnames <- colnames(x)

@@ -8,9 +8,9 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
#' xgb.DMatrix.save(dtrain, fname)
#' dtrain <- xgb.DMatrix(fname)
#' @export
xgb.DMatrix.save <- function(dmatrix, fname) {
if (typeof(fname) != "character")

@@ -51,7 +51,7 @@
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
#' param <- list(max_depth=2, eta=1, objective='binary:logistic')
#' nrounds = 4
#'
#' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)

@@ -126,6 +126,9 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {

check.deprecation(...)
if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) {
stop("'data' is an invalid 'xgb.DMatrix' object. Must be constructed again.")
}

params <- check.booster.params(params, ...)
# TODO: should we deprecate the redundant 'metrics' parameter?
@@ -136,7 +139,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
check.custom.eval()

# Check the labels
if ((inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) ||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) {
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
} else if (inherits(data, 'xgb.DMatrix')) {
@@ -201,13 +204,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
dtrain <- slice(dall, unlist(folds[-k]))
else
dtrain <- slice(dall, train_folds[[k]])
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
modelfile = NULL,
handle = NULL
modelfile = NULL
)
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
bst <- bst$bst
list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
})
rm(dall)
# a "basket" to collect some results from callbacks
@@ -228,21 +231,22 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing

msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(
booster_handle = fd$bst,
bst = fd$bst,
dtrain = fd$dtrain,
iter = iteration - 1,
obj = obj
)
xgb.iter.eval(
booster_handle = fd$bst,
bst = fd$bst,
watchlist = fd$watchlist,
iter = iteration - 1,
feval = feval
)
})
msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg)
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) # nolint
# Note: these variables might look unused here, but they are used in the callbacks
bst_evaluation <- rowMeans(msg) # nolint
bst_evaluation_err <- apply(msg, 1, sd) # nolint

for (f in cb$post_iter) f()

@@ -263,7 +267,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
ret <- c(ret, basket)

class(ret) <- 'xgb.cv.synchronous'
invisible(ret)
return(invisible(ret))
}


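A numeric sketch of the fold-aggregation change above: the per-metric spread across folds moves from the population formula to R's sample sd() (n - 1 denominator):

msg <- matrix(c(0.10, 0.12, 0.11,   # metric 1 across 3 folds
                0.30, 0.28, 0.29),  # metric 2 across 3 folds
              nrow = 2, byrow = TRUE)
sqrt(rowMeans(msg^2) - rowMeans(msg)^2)  # old: population standard deviation
apply(msg, 1, sd)                        # new: sample standard deviation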
@@ -13,7 +13,10 @@
#' When this option is on, the model dump contains two additional values:
#' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node.
#' @param dump_format either 'text' or 'json' format could be specified.
#' @param dump_format either 'text', 'json', or 'dot' (graphviz) format could be specified.
#'
#' Format 'dot' for a single tree can be passed directly to packages that consume this format
#' for graph visualization, such as function [DiagrammeR::grViz()]
#' @param ... currently not used
#'
#' @return
@@ -37,9 +40,13 @@
#' # print in JSON format:
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#'
#' # plot first tree leveraging the 'dot' format
#' if (requireNamespace('DiagrammeR', quietly = TRUE)) {
#' DiagrammeR::grViz(xgb.dump(bst, dump_format = "dot")[[1L]])
#' }
#' @export
xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
dump_format = c("text", "json"), ...) {
dump_format = c("text", "json", "dot"), ...) {
check.deprecation(...)
dump_format <- match.arg(dump_format)
if (!inherits(model, "xgb.Booster"))
@@ -49,9 +56,16 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be a character string (when provided)")

model <- xgb.Booster.complete(model)
model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format))
model_dump <- .Call(
XGBoosterDumpModel_R,
xgb.get.handle(model),
NVL(fmap, "")[1],
as.integer(with_stats),
as.character(dump_format)
)
if (dump_format == "dot") {
return(sapply(model_dump, function(x) gsub("^booster\\[\\d+\\]\\n", "\\1", x)))
}

if (is.null(fname))
model_dump <- gsub('\t', '', model_dump, fixed = TRUE)

@ -127,22 +127,20 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL,
p
}

#' Combine and melt feature values and SHAP contributions for sample
#' observations.
#' Combine feature values and SHAP values
#'
#' Conforms to data format required for ggplot functions.
#' Internal function used to combine and melt feature values and SHAP contributions
#' as required for ggplot functions related to SHAP.
#'
#' Internal utility function.
#' @param data_list The result of `xgb.shap.data()`.
#' @param normalize Whether to standardize feature values to mean 0 and
#' standard deviation 1. This is useful for comparing multiple features on the same
#' plot. Default is \code{FALSE}.
#'
#' @param data_list List containing 'data' and 'shap_contrib' returned by
#' \code{xgb.shap.data()}.
#' @param normalize Whether to standardize feature values to have mean 0 and
#' standard deviation 1 (useful for comparing multiple features on the same
#' plot). Default \code{FALSE}.
#'
#' @return A data.table containing the observation ID, the feature name, the
#' @return A `data.table` containing the observation ID, the feature name, the
#' feature value (normalized if specified), and the SHAP contribution value.
#' @noRd
#' @keywords internal
prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
data <- data_list[["data"]]
shap_contrib <- data_list[["shap_contrib"]]
@ -163,15 +161,16 @@ prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
p_data
}

#' Scale feature value to have mean 0, standard deviation 1
#' Scale feature values
#'
#' This is used to compare multiple features on the same plot.
#' Internal utility function
#' Internal function that scales feature values to mean 0 and standard deviation 1.
#' Useful to compare multiple features on the same plot.
#'
#' @param x Numeric vector
#' @param x Numeric vector.
#'
#' @return Numeric vector with mean 0 and sd 1.
#' @return Numeric vector with mean 0 and standard deviation 1.
#' @noRd
#' @keywords internal
normalize <- function(x) {
loc <- mean(x, na.rm = TRUE)
scale <- stats::sd(x, na.rm = TRUE)

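The hunk cuts the body off after `loc` and `scale`, but the transformation is a plain z-score; a minimal standalone sketch with a hypothetical input vector (not part of the patch):

x <- c(1, 2, 3, NA, 5)                                        # hypothetical input
z <- (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE) # what normalize() computes
c(mean(z, na.rm = TRUE), stats::sd(z, na.rm = TRUE))          # ~0 and 1
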
@ -1,83 +1,115 @@
#' Importance of features in a model.
#' Feature importance
#'
#' Creates a \code{data.table} of feature importances in a model.
#' Creates a `data.table` of feature importances.
#'
#' @param feature_names character vector of feature names. If the model already
#' contains feature names, those would be used when \code{feature_names=NULL} (default value).
#' Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster}.
#' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included
#' into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
#' It could be useful, e.g., in multiclass classification to get feature importances
#' for each class separately. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param data deprecated.
#' @param label deprecated.
#' @param target deprecated.
#' @param feature_names Character vector used to overwrite the feature names
#' of the model. The default is `NULL` (use original feature names).
#' @param model Object of class `xgb.Booster`.
#' @param trees An integer vector of tree indices that should be included
#' into the importance calculation (only for the "gbtree" booster).
#' The default (`NULL`) parses all trees.
#' It could be useful, e.g., in multiclass classification to get feature importances
#' for each class separately. *Important*: the tree index in XGBoost models
#' is zero-based (e.g., use `trees = 0:4` for the first five trees).
#' @param data Deprecated.
#' @param label Deprecated.
#' @param target Deprecated.
#'
#' @details
#'
#' This function works for both linear and tree models.
#'
#' For linear models, the importance is the absolute magnitude of linear coefficients.
#' For that reason, in order to obtain a meaningful ranking by importance for a linear model,
#' the features need to be on the same scale (which you also would want to do when using either
#' L1 or L2 regularization).
#' To obtain a meaningful ranking by importance for linear models, the features need to
#' be on the same scale (which is also recommended when using L1 or L2 regularization).
#'
#' @return
#' @return A `data.table` with the following columns:
#'
#' For a tree model, a \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Features} names of the features used in the model;
#' \item \code{Gain} represents fractional contribution of each feature to the model based on
#' the total gain of this feature's splits. Higher percentage means a more important
#' predictive feature.
#' \item \code{Cover} metric of the number of observation related to this feature;
#' \item \code{Frequency} percentage representing the relative number of times
#' a feature have been used in trees.
#' }
#' For a tree model:
#' - `Features`: Names of the features used in the model.
#' - `Gain`: Fractional contribution of each feature to the model based on
#' the total gain of this feature's splits. Higher percentage means higher importance.
#' - `Cover`: Metric of the number of observations related to this feature.
#' - `Frequency`: Percentage of times a feature has been used in trees.
#'
#' A linear model's importance \code{data.table} has the following columns:
#' \itemize{
#' \item \code{Features} names of the features used in the model;
#' \item \code{Weight} the linear coefficient of this feature;
#' \item \code{Class} (only for multiclass models) class label.
#' }
#' For a linear model:
#' - `Features`: Names of the features used in the model.
#' - `Weight`: Linear coefficient of this feature.
#' - `Class`: Class label (only for multiclass models).
#'
#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
#' index of the features will be used instead. Because the index is extracted from the model dump
#' If `feature_names` is not provided and `model` doesn't have `feature_names`,
#' the index of the features will be used instead. Because the index is extracted from the model dump
#' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
#'
#' @examples
#'
#' # binomial classification using gbtree:
#' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' # binomial classification using "gbtree":
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgboost(
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 2,
#' eta = 1,
#' nthread = 2,
#' nrounds = 2,
#' objective = "binary:logistic"
#' )
#'
#' xgb.importance(model = bst)
#'
#' # binomial classification using gblinear:
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
#' eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
#' # binomial classification using "gblinear":
#' bst <- xgboost(
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' booster = "gblinear",
#' eta = 0.3,
#' nthread = 1,
#' nrounds = 20,
#' objective = "binary:logistic"
#' )
#'
#' xgb.importance(model = bst)
#'
#' # multiclass classification using gbtree:
#' # multiclass classification using "gbtree":
#' nclass <- 3
#' nrounds <- 10
#' mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
#' max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
#' objective = "multi:softprob", num_class = nclass)
#' mbst <- xgboost(
#' data = as.matrix(iris[, -5]),
#' label = as.numeric(iris$Species) - 1,
#' max_depth = 3,
#' eta = 0.2,
#' nthread = 2,
#' nrounds = nrounds,
#' objective = "multi:softprob",
#' num_class = nclass
#' )
#'
#' # all classes clumped together:
#' xgb.importance(model = mbst)
#' # inspect importances separately for each class:
#' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
#' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
#' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
#'
#' # multiclass classification using gblinear:
#' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
#' booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
#' objective = "multi:softprob", num_class = nclass)
#' # inspect importances separately for each class:
#' xgb.importance(
#' model = mbst, trees = seq(from = 0, by = nclass, length.out = nrounds)
#' )
#' xgb.importance(
#' model = mbst, trees = seq(from = 1, by = nclass, length.out = nrounds)
#' )
#' xgb.importance(
#' model = mbst, trees = seq(from = 2, by = nclass, length.out = nrounds)
#' )
#'
#' # multiclass classification using "gblinear":
#' mbst <- xgboost(
#' data = scale(as.matrix(iris[, -5])),
#' label = as.numeric(iris$Species) - 1,
#' booster = "gblinear",
#' eta = 0.2,
#' nthread = 1,
#' nrounds = 15,
#' objective = "multi:softprob",
#' num_class = nclass
#' )
#'
#' xgb.importance(model = mbst)
#'
#' @export
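As the return-value docs above note, tree-model `Gain` is a fractional contribution over the whole model; a quick sketch (assuming the `bst` fitted in the examples above; not part of the patch) to see the normalization:

imp <- xgb.importance(model = bst)
sum(imp$Gain)      # ~1: Gain fractions are normalized over the whole model
imp[order(-Gain)]  # features ranked by importance
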
@ -87,21 +119,21 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")

if (!inherits(model, "xgb.Booster"))
stop("model: must be an object of class xgb.Booster")

if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names
if (is.null(feature_names)) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names)) {
feature_names <- model_feature_names
}
}

if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector")

model <- xgb.Booster.complete(model)
config <- jsonlite::fromJSON(xgb.config(model))
if (config$learner$gradient_booster$name == "gblinear") {
handle <- xgb.get.handle(model)
if (xgb.booster_type(model) == "gblinear") {
args <- list(importance_type = "weight", feature_names = feature_names)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", "weight")
if (length(results$shape) == 2) {
@ -122,7 +154,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
for (importance_type in c("weight", "total_gain", "total_cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", importance_type)
concatenated[

@ -17,7 +17,7 @@
#' An object of \code{xgb.Booster} class.
#'
#' @seealso
#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.save}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -29,40 +29,37 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(
#' data = train$data, label = train$label, max_depth = 2, eta = 1,
#' bst <- xgb.train(
#' data = xgb.DMatrix(train$data, label = train$label),
#' max_depth = 2,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 2,
#' objective = "binary:logistic"
#' )
#'
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' if (file.exists('xgb.model')) file.remove('xgb.model')
#' fname <- file.path(tempdir(), "xgb.ubj")
#' xgb.save(bst, fname)
#' bst <- xgb.load(fname)
#' @export
xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop("xgb.load: modelfile cannot be NULL")

handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = list(),
cachelist = list(),
modelfile = modelfile,
handle = NULL
modelfile = modelfile
)
bst <- bst$bst
# re-use modelfile if it is raw so we do not need to serialize
if (typeof(modelfile) == "raw") {
warning(
paste(
"The support for loading raw booster with `xgb.load` will be ",
"discontinued in upcoming release. Use `xgb.load.raw` or",
" `xgb.unserialize` instead. "
"discontinued in upcoming release. Use `xgb.load.raw` instead. "
)
)
bst <- xgb.handleToBooster(handle = handle, raw = modelfile)
} else {
bst <- xgb.handleToBooster(handle = handle, raw = NULL)
}
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
return(bst)
}

@ -3,21 +3,10 @@
#' User can generate raw memory buffer by calling xgb.save.raw
#'
#' @param buffer the buffer returned by xgb.save.raw
#' @param as_booster Return the loaded model as xgb.Booster instead of xgb.Booster.handle.
#'
#' @export
xgb.load.raw <- function(buffer, as_booster = FALSE) {
xgb.load.raw <- function(buffer) {
cachelist <- list()
handle <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
class(handle) <- "xgb.Booster.handle"

if (as_booster) {
booster <- list(handle = handle, raw = NULL)
class(booster) <- "xgb.Booster"
booster <- xgb.Booster.complete(booster, saveraw = TRUE)
return(booster)
} else {
return(handle)
}
bst <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer)
return(bst)
}

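With the `as_booster` branch removed, the raw round trip becomes uniform; a short sketch (assumes a fitted booster `bst`; not part of the patch):

raw <- xgb.save.raw(bst)   # serialize the model to an R raw vector
bst2 <- xgb.load.raw(raw)  # now always returns an 'xgb.Booster'
class(bst2)
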
@ -1,67 +1,72 @@
#' Parse a boosted tree model text dump
#' Parse model text dump
#'
#' Parse a boosted tree model text dump into a \code{data.table} structure.
#' Parse a boosted tree model text dump into a `data.table` structure.
#'
#' @param feature_names character vector of feature names. If the model already
#' contains feature names, those would be used when \code{feature_names=NULL} (default value).
#' Non-null \code{feature_names} could be provided to override those in the model.
#' @param model object of class \code{xgb.Booster}
#' @param text \code{character} vector previously generated by the \code{xgb.dump}
#' function (where parameter \code{with_stats = TRUE} should have been set).
#' \code{text} takes precedence over \code{model}.
#' @param trees an integer vector of tree indices that should be parsed.
#' If set to \code{NULL}, all trees of the model are parsed.
#' It could be useful, e.g., in multiclass classification to get only
#' the trees of one certain class. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
#' represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).
#' @param ... currently not used.
#' @param feature_names Character vector of feature names. If the model already
#' contains feature names, those will be used when \code{feature_names=NULL} (default value).
#'
#' Note that, if the model already contains feature names, it's \bold{not} possible to override them here.
#' @param model Object of class `xgb.Booster`.
#' @param text Character vector previously generated by the function [xgb.dump()]
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
#' @param trees An integer vector of tree indices that should be used.
#' The default (`NULL`) uses all trees.
#' Useful, e.g., in multiclass classification to get only
#' the trees of one class. *Important*: the tree index in XGBoost models
#' is zero-based (e.g., use `trees = 0:4` for the first five trees).
#' @param use_int_id A logical flag indicating whether nodes in columns "Yes", "No", and
#' "Missing" should be represented as integers (when `TRUE`) or as "Tree-Node"
#' character strings (when `FALSE`, default).
#' @param ... Currently not used.
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes.
#' A `data.table` with detailed information about tree nodes. It has the following columns:
#' - `Tree`: integer ID of a tree in a model (zero-based index).
#' - `Node`: integer ID of a node in a tree (zero-based index).
#' - `ID`: character identifier of a node in a model (only when `use_int_id = FALSE`).
#' - `Feature`: for a branch node, a feature ID or name (when available);
#' for a leaf node, it simply labels it as `"Leaf"`.
#' - `Split`: location of the split for a branch node (split condition is always "less than").
#' - `Yes`: ID of the next node when the split condition is met.
#' - `No`: ID of the next node when the split condition is not met.
#' - `Missing`: ID of the next node when the branch value is missing.
#' - `Gain`: either the split gain (change in loss) or the leaf value.
#' - `Cover`: metric related to the number of observations either seen by a split
#' or collected by a leaf during training.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{Tree}: integer ID of a tree in a model (zero-based index)
#' \item \code{Node}: integer ID of a node in a tree (zero-based index)
#' \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
#' \item \code{Feature}: for a branch node, it's a feature id or name (when available);
#' for a leaf note, it simply labels it as \code{'Leaf'}
#' \item \code{Split}: location of the split for a branch node (split condition is always "less than")
#' \item \code{Yes}: ID of the next node when the split condition is met
#' \item \code{No}: ID of the next node when the split condition is not met
#' \item \code{Missing}: ID of the next node when branch value is missing
#' \item \code{Quality}: either the split gain (change in loss) or the leaf value
#' \item \code{Cover}: metric related to the number of observation either seen by a split
#' or collected by a leaf during training.
#' }
#'
#' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
#' When `use_int_id = FALSE`, columns "Yes", "No", and "Missing" point to model-wide node identifiers
#' in the "ID" column. When `use_int_id = TRUE`, those columns point to node identifiers from
#' the corresponding trees in the "Node" column.
#'
#' @examples
#' # Basic use:
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
#'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#' bst <- xgboost(
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 2,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 2,
#' objective = "binary:logistic"
#' )
#'
#' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst))
#'
#' # How to match feature names of splits that are following a current 'Yes' branch:
#'
#' merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
#' merge(
#' dt,
#' dt[, .(ID, Y.Feature = Feature)], by.x = "Yes", by.y = "ID", all.x = TRUE
#' )[
#' order(Tree, Node)
#' ]
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
@ -74,8 +79,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
" (or NULL if 'model' was provided).")
}

if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names
model_feature_names <- NULL
if (inherits(model, "xgb.Booster")) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names) && !is.null(feature_names)) {
stop("'model' contains feature names. Cannot override them.")
}
}
if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names))
feature_names <- model_feature_names

if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector")
@ -85,8 +97,10 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
stop("trees: must be a vector of integers.")
}

from_text <- TRUE
if (is.null(text)) {
text <- xgb.dump(model = model, with_stats = TRUE)
from_text <- FALSE
}

if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) {
@ -115,9 +129,29 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
td[, isLeaf := grepl("leaf", t, fixed = TRUE)]

# parse branch lines
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
branch_rx_nonames <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
text_has_feature_names <- FALSE
if (NROW(model_feature_names)) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
} else {
# Note: when passing a text dump, it might or might not have feature names,
# but that aspect is unknown from just the text attributes
branch_rx <- branch_rx_nonames
if (from_text) {
if (sum(grepl(branch_rx_w_names, text)) > sum(grepl(branch_rx_nonames, text))) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
}
}
}
if (text_has_feature_names && is.null(model) && !is.null(feature_names)) {
stop("'text' contains feature names. Cannot override them.")
}
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
td[
isLeaf == FALSE,
(branch_cols) := {
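A sketch of how the two branch patterns above tell dumps apart, on two hypothetical dump lines (simplified patterns; the real ones also capture the split value, gain, and cover fields; not part of the patch):

no_names <- "1:[f29<0.5] yes=2,no=3,missing=2,gain=67.89,cover=100"
w_names  <- "1:[odor=none<0.5] yes=2,no=3,missing=2,gain=67.89,cover=100"
grepl("f(\\d+)<", c(no_names, w_names))       # TRUE FALSE: only the unnamed dump matches
grepl("\\d+:\\[(.+)<", c(no_names, w_names))  # TRUE TRUE: hence the strict match-count comparison
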
@ -127,7 +161,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
if (length(xtr) == 0) {
as.data.table(
list(Feature = "NA", Split = "NA", Yes = "NA", No = "NA", Missing = "NA", Quality = "NA", Cover = "NA")
list(Feature = "NA", Split = "NA", Yes = "NA", No = "NA", Missing = "NA", Gain = "NA", Cover = "NA")
)
} else {
as.data.table(xtr)
@ -139,15 +173,17 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
is_stump <- function() {
return(length(td$Feature) == 1 && is.na(td$Feature))
}
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
if (!text_has_feature_names) {
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
}
}

# parse leaf lines
leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
leaf_cols <- c("Feature", "Quality", "Cover")
leaf_cols <- c("Feature", "Gain", "Cover")
td[
isLeaf == TRUE,
(leaf_cols) := {
@ -162,7 +198,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
]

# convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover")
numeric_cols <- c("Split", "Gain", "Cover")
td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
if (use_int_id) {
int_cols <- c("Yes", "No", "Missing")

@ -1,65 +1,74 @@
#' Plot model trees deepness
#' Plot model tree depth
#'
#' Visualizes distributions related to depth of tree leafs.
#' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
#' Visualizes distributions related to the depth of tree leaves.
#' - `xgb.plot.deepness()` uses base R graphics, while
#' - `xgb.ggplot.deepness()` uses "ggplot2".
#'
#' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function
#' or a data.table result of the \code{xgb.model.dt.tree} function.
#' @param plot (base R barplot) whether a barplot should be produced.
#' If FALSE, only a data.table is returned.
#' @param which which distribution to plot (see details).
#' @param ... other parameters passed to \code{barplot} or \code{plot}.
#' @param model Either an `xgb.Booster` model, or the "data.table" returned by [xgb.model.dt.tree()].
#' @param which Which distribution to plot (see details).
#' @param plot Should the plot be shown? Default is `TRUE`.
#' @param ... Other parameters passed to [graphics::barplot()] or [graphics::plot()].
#'
#' @details
#'
#' When \code{which="2x1"}, two distributions with respect to the leaf depth
#' When `which = "2x1"`, two distributions with respect to the leaf depth
#' are plotted on top of each other:
#' \itemize{
#' \item the distribution of the number of leafs in a tree model at a certain depth;
#' \item the distribution of average weighted number of observations ("cover")
#' ending up in leafs at certain depth.
#' }
#' Those could be helpful in determining sensible ranges of the \code{max_depth}
#' and \code{min_child_weight} parameters.
#' 1. The distribution of the number of leaves in a tree model at a certain depth.
#' 2. The distribution of the average weighted number of observations ("cover")
#' ending up in leaves at a certain depth.
#'
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
#' Those could be helpful in determining sensible ranges of the `max_depth`
#' and `min_child_weight` parameters.
#'
#' When `which = "max.depth"` or `which = "med.depth"`, plots of either maximum or
#' median depth per tree with respect to the tree number are created.
#'
#' Finally, `which = "med.weight"` allows to see how
#' a tree's median absolute leaf weight changes through the iterations.
#'
#' This function was inspired by the blog post
#' \url{https://github.com/aysent/random-forest-leaf-visualization}.
#' These functions have been inspired by the blog post
#' <https://github.com/aysent/random-forest-leaf-visualization>.
#'
#' @return
#' The return value of the two functions is as follows:
#' - `xgb.plot.deepness()`: A "data.table" (invisibly).
#' Each row corresponds to a terminal leaf in the model and contains information
#' about the leaf's depth, cover, and weight (used in calculating predictions).
#' If `plot = TRUE`, a plot is also shown.
#' - `xgb.ggplot.deepness()`: When `which = "2x1"`, a list of two "ggplot" objects,
#' and a single "ggplot" object otherwise.
#'
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
#'
#' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
#' or a single ggplot graph for the other \code{which} options.
#'
#' @seealso
#'
#' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
#' @seealso [xgb.train()] and [xgb.model.dt.tree()].
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#' ## Keep the number of threads to 2 for examples
#' nthread <- 2
#' data.table::setDTthreads(nthread)
#'
#' ## Change max_depth to a higher number to get a more significant result
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
#' eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
#' subsample = 0.5, min_child_weight = 2)
#' bst <- xgboost(
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 6,
#' nthread = nthread,
#' nrounds = 50,
#' objective = "binary:logistic",
#' subsample = 0.5,
#' min_child_weight = 2
#' )
#'
#' xgb.plot.deepness(bst)
#' xgb.ggplot.deepness(bst)
#'
#' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
#' xgb.plot.deepness(
#' bst, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2
#' )
#'
#' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
#' xgb.plot.deepness(
#' bst, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2
#' )
#'
#' @rdname xgb.plot.deepness
#' @export
@ -83,7 +92,7 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
stop("Model tree columns are not as expected!\n",
" Note that this function works only for tree models.")

dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Gain)], by = "ID")
setkeyv(dt_depths, c("Tree", "ID"))
# count by depth levels, and also calculate average cover at a depth
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
@ -148,6 +157,6 @@ get.leaf.depth <- function(dt_tree) {
# They are mainly column names inferred by Data.table...
globalVariables(
c(
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
".N", "N", "Depth", "Gain", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
)
)

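A sketch of using the invisible return value documented above (assumes the `bst` from the deepness examples; the column names are an assumption following the merge in the function body; not part of the patch):

dd <- xgb.plot.deepness(bst, plot = FALSE)  # one row per terminal leaf
dd[, .(leaves = .N, mean_cover = mean(Cover)), by = Depth]
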
@ -1,64 +1,75 @@
#' Plot feature importance as a bar graph
#' Plot feature importance
#'
#' Represents previously calculated feature importance as a bar graph.
#' \code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
#' - `xgb.plot.importance()` uses base R graphics, while
#' - `xgb.ggplot.importance()` uses "ggplot".
#'
#' @param importance_matrix a \code{data.table} returned by \code{\link{xgb.importance}}.
#' @param top_n maximal number of top features to include into the plot.
#' @param measure the name of importance measure to plot.
#' When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first whether importance values should be represented as relative to the highest ranked feature.
#' See Details.
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
#' When it is NULL, the existing \code{par('mar')} is used.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#' @param plot (base R barplot) whether a barplot should be produced.
#' If FALSE, only a data.table is returned.
#' @param n_clusters (ggplot only) a \code{numeric} vector containing the min and the max range
#' @param importance_matrix A `data.table` as returned by [xgb.importance()].
#' @param top_n Maximal number of top features to include into the plot.
#' @param measure The name of importance measure to plot.
#' When `NULL`, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first Whether importance values should be represented as relative to
#' the highest ranked feature, see Details.
#' @param left_margin Adjust the left margin size to fit feature names.
#' When `NULL`, the existing `par("mar")` is used.
#' @param cex Passed as `cex.names` parameter to [graphics::barplot()].
#' @param plot Should the barplot be shown? Default is `TRUE`.
#' @param n_clusters A numeric vector containing the min and the max range
#' of the possible number of clusters of bars.
#' @param ... other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).
#' @param ... Other parameters passed to [graphics::barplot()]
#' (except `horiz`, `border`, `cex.names`, `names.arg`, and `las`).
#' Only used in `xgb.plot.importance()`.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
#' Features are shown ranked in a decreasing importance order.
#' It works for importances from both \code{gblinear} and \code{gbtree} models.
#' Features are sorted by decreasing importance.
#' It works for both "gblinear" and "gbtree" models.
#'
#' When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
#' For gbtree model, that would mean being normalized to the total of 1
#' When `rel_to_first = FALSE`, the values would be plotted as in `importance_matrix`.
#' For a "gbtree" model, that would mean being normalized to the total of 1
#' ("what is feature's importance contribution relative to the whole model?").
#' For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
#' Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
#' For linear models, `rel_to_first = FALSE` would show actual values of the coefficients.
#' Setting `rel_to_first = TRUE` allows to see the picture from the perspective of
#' "what is feature's importance contribution relative to the most important feature?"
#'
#' The ggplot-backend method also performs 1-D clustering of the importance values,
#' with bar colors corresponding to different clusters that have somewhat similar importance values.
#' The "ggplot" backend performs 1-D clustering of the importance values,
#' with bar colors corresponding to different clusters having similar importance values.
#'
#' @return
#' The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
#' and silently returns a processed data.table with \code{n_top} features sorted by importance.
#' The return value depends on the function:
#' - `xgb.plot.importance()`: Invisibly, a "data.table" with `n_top` features sorted
#' by importance. If `plot = TRUE`, the values are also plotted as barplot.
#' - `xgb.ggplot.importance()`: A customizable "ggplot" object.
#' E.g., to change the title, set `+ ggtitle("A GRAPH NAME")`.
#'
#' The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
#' E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
#'
#' @seealso
#' \code{\link[graphics]{barplot}}.
#' @seealso [graphics::barplot()]
#'
#' @examples
#' data(agaricus.train)
#'
#' ## Keep the number of threads to 2 for examples
#' nthread <- 2
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(
#' data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#' eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 3,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 2,
#' objective = "binary:logistic"
#' )
#'
#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
#' xgb.plot.importance(
#' importance_matrix, rel_to_first = TRUE, xlab = "Relative importance"
#' )
#'
#' xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
#'
#' (gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
#' gg <- xgb.ggplot.importance(
#' importance_matrix, measure = "Frequency", rel_to_first = TRUE
#' )
#' gg
#' gg + ggplot2::ylab("Frequency")
#'
#' @rdname xgb.plot.importance

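A sketch of what `rel_to_first = TRUE` does to the plotted values, applied to the `importance_matrix` from the examples above (the division by the top-ranked value is an assumption matching the Details section; not part of the patch):

im <- importance_matrix[order(-Gain)]
im[, Relative := Gain / Gain[1L]]  # 1 for the top feature, fractions below it
head(im)
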
@ -1,14 +1,10 @@
#' Project all trees on one tree and plot it
#' Project all trees on one tree
#'
#' Visualization of the ensemble of trees as a single collective unit.
#'
#' @param model produced by the \code{xgb.train} function.
#' @param feature_names names of each feature as a \code{character} vector.
#' @param features_keep number of features to keep in each position of the multi trees.
#' @param plot_width width in pixels of the graph to produce
#' @param plot_height height in pixels of the graph to produce
#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param ... currently not used
#' @inheritParams xgb.plot.tree
#' @param features_keep Number of features to keep in each position of the multi trees,
#' by default 5.
#'
#' @details
#'
@ -24,33 +20,31 @@
#' Moreover, the trees tend to reuse the same features.
#'
#' The function projects each tree onto one, and keeps for each position the
#' \code{features_keep} first features (based on the Gain per feature measure).
#' `features_keep` first features (based on the Gain per feature measure).
#'
#' This function is inspired by this blog post:
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
#' <https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/>
#'
#' @return
#'
#' When \code{render = TRUE}:
#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
#'
#' When \code{render = FALSE}:
#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
#' This could be useful if one wants to modify some of the graph attributes
#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
#' @inherit xgb.plot.tree return
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' ## Keep the number of threads to 2 for examples
#' nthread <- 2
#' data.table::setDTthreads(nthread)
#'
#' bst <- xgboost(
#' data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
#' eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
#' min_child_weight = 50, verbose = 0
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 15,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 30,
#' objective = "binary:logistic",
#' min_child_weight = 50,
#' verbose = 0
#' )
#'
#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
@ -58,10 +52,13 @@
#'
#' \dontrun{
#' # Below is an example of how to save this plot to a file.
#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
#' # Note that for export_graph() to work, the {DiagrammeRsvg} and {rsvg} packages
#' # must also be installed.
#'
#' library(DiagrammeR)
#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
#' export_graph(gr, 'tree.pdf', width=1500, height=600)
#'
#' gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
#' export_graph(gr, "tree.pdf", width = 1500, height = 600)
#' }
#'
#' @export
@ -98,13 +95,13 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
data.table::set(tree.matrix, j = nm, value = sub("^\\d+-", "", tree.matrix[[nm]]))

nodes.dt <- tree.matrix[
, .(Quality = sum(Quality))
, .(Gain = sum(Gain))
, by = .(abs.node.position, Feature)
][, .(Text = paste0(
paste0(
Feature[seq_len(min(length(Feature), features_keep))],
" (",
format(Quality[seq_len(min(length(Quality), features_keep))], digits = 5),
format(Gain[seq_len(min(length(Gain), features_keep))], digits = 5),
")"
),
collapse = "\n"

@ -1,110 +1,165 @@
#' SHAP contribution dependency plots
#' SHAP dependence plots
#'
#' Visualizing the SHAP feature contribution to prediction dependencies on feature value.
#' Visualizes SHAP values against feature values to gain an impression of feature effects.
#'
#' @param data data as a \code{matrix} or \code{dgCMatrix}.
#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above
#' \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.
#' @param features a vector of either column indices or of feature names to plot. When it is NULL,
#' feature importance is calculated, and \code{top_n} high ranked features are taken.
#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.
#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
#' or \code{features} is missing.
#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}.
#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index,
#' only SHAP contributions for that specific class are used.
#' If it is not set, SHAP importances are averaged over all classes.
#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.
#' @param subsample a random fraction of data points to use for plotting. When it is NULL,
#' it is set so that up to 100K data points are used.
#' @param n_col a number of columns in a grid of plots.
#' @param col color of the scatterplot markers.
#' @param pch scatterplot marker.
#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete.
#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions.
#' @param ylab a y-axis label in 1D plots.
#' @param plot_NA whether the contributions of cases with missing values should also be plotted.
#' @param col_NA a color of marker for missing value contributions.
#' @param pch_NA a marker type for NA values.
#' @param pos_NA a relative position of the x-location where NA values are shown:
#' \code{min(x) + (max(x) - min(x)) * pos_NA}.
#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with
#' more than 5 distinct values.
#' @param col_loess a color to use for the loess curves.
#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call.
#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.
#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned.
#' @param ... other parameters passed to \code{plot}.
#' @param data The data to explain as a `matrix` or `dgCMatrix`.
#' @param shap_contrib Matrix of SHAP contributions of `data`.
#' The default (`NULL`) computes it from `model` and `data`.
#' @param features Vector of column indices or feature names to plot.
#' When `NULL` (default), the `top_n` most important features are selected
#' by [xgb.importance()].
#' @param top_n How many of the most important features (<= 100) should be selected?
#' By default, 1 for SHAP dependence and 10 for SHAP summary.
#' Only used when `features = NULL`.
#' @param model An `xgb.Booster` model. Only required when `shap_contrib = NULL` or
#' `features = NULL`.
#' @param trees Passed to [xgb.importance()] when `features = NULL`.
#' @param target_class Only relevant for multiclass models. The default (`NULL`)
#' averages the SHAP values over all classes. Pass a (0-based) class index
#' to show only SHAP values of that class.
#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`.
#' @param subsample Fraction of data points randomly picked for plotting.
#' The default (`NULL`) will use up to 100k data points.
#' @param n_col Number of columns in a grid of plots.
#' @param col Color of the scatterplot markers.
#' @param pch Scatterplot marker.
#' @param discrete_n_uniq Maximal number of unique feature values to consider the
#' feature as discrete.
#' @param discrete_jitter Jitter amount added to the values of discrete features.
#' @param ylab The y-axis label in 1D plots.
#' @param plot_NA Should contributions of cases with missing values be plotted?
#' Default is `TRUE`.
#' @param col_NA Color of marker for missing value contributions.
#' @param pch_NA Marker type for `NA` values.
#' @param pos_NA Relative position of the x-location where `NA` values are shown:
#' `min(x) + (max(x) - min(x)) * pos_NA`.
#' @param plot_loess Should loess-smoothed curves be plotted? (Default is `TRUE`).
#' The smoothing is only done for features with more than 5 distinct values.
#' @param col_loess Color of loess curves.
#' @param span_loess The `span` parameter of [stats::loess()].
#' @param which Whether to do univariate or bivariate plotting. Currently, only "1d" is implemented.
#' @param plot Should the plot be drawn? (Default is `TRUE`).
#' If `FALSE`, only a list of matrices is returned.
#' @param ... Other parameters passed to [graphics::plot()].
#'
#' @details
#'
#' These scatterplots represent how SHAP feature contributions depend on feature values.
#' The similarity to partial dependency plots is that they also give an idea for how feature values
#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies
#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated
#' contributions of a feature to model prediction for each individual case.
#' The similarity to partial dependence plots is that they also give an idea for how feature values
#' affect predictions. However, in partial dependence plots, we see marginal dependencies
#' of model prediction on feature value, while SHAP dependence plots display the estimated
#' contributions of a feature to the prediction for each individual case.
#'
#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
#' weighted LOESS is computed and plotted, where weights are the numbers of data points
#' When `plot_loess = TRUE`, feature values are rounded to three significant digits and
#' weighted LOESS is computed and plotted, where the weights are the numbers of data points
#' at each rounded value.
#'
#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
#' the margin is prediction before a sigmoidal transform into probability-like values.
#' Note: SHAP contributions are on the scale of the model margin.
#' E.g., for a logistic binomial objective, the margin is on log-odds scale.
#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
#' contributions for all features + bias), depending on the objective used, transforming SHAP
#' contributions for a feature from the marginal to the prediction space is not necessarily
#' a meaningful thing to do.
#'
#' @return
#'
#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
#' \itemize{
#' \item \code{data} the values of selected features;
#' \item \code{shap_contrib} the contributions of selected features.
#' }
#' In addition to producing plots (when `plot = TRUE`), it silently returns a list of two matrices:
#' - `data`: Feature value matrix.
#' - `shap_contrib`: Corresponding SHAP value matrix.
#'
#' @references
#'
#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
#'
#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
#' 1. Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
#' NIPS Proceedings 2017, <https://arxiv.org/abs/1705.07874>
#' 2. Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles",
#' <https://arxiv.org/abs/1706.06060>
#'
#' @examples
|
||||
#'
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' data(agaricus.test, package='xgboost')
|
||||
#' data(agaricus.train, package = "xgboost")
|
||||
#' data(agaricus.test, package = "xgboost")
|
||||
#'
|
||||
#' ## Keep the number of threads to 1 for examples
|
||||
#' nthread <- 1
|
||||
#' data.table::setDTthreads(nthread)
|
||||
#' nrounds <- 20
|
||||
#'
|
||||
#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
|
||||
#' eta = 0.1, max_depth = 3, subsample = .5,
|
||||
#' method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
|
||||
#' bst <- xgboost(
|
||||
#' agaricus.train$data,
|
||||
#' agaricus.train$label,
|
||||
#' nrounds = nrounds,
|
||||
#' eta = 0.1,
|
||||
#' max_depth = 3,
|
||||
#' subsample = 0.5,
|
||||
#' objective = "binary:logistic",
|
||||
#' nthread = nthread,
|
||||
#' verbose = 0
|
||||
#' )
|
||||
#'
|
||||
#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
|
||||
#'
|
||||
#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
|
||||
#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
|
||||
#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot
|
||||
#'
|
||||
#' # multiclass example - plots for each class separately:
|
||||
#' # Summary plot
|
||||
#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)
|
||||
#'
|
||||
#' # Multiclass example - plots for each class separately:
|
||||
#' nclass <- 3
|
||||
#' x <- as.matrix(iris[, -5])
|
||||
#' set.seed(123)
|
||||
#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
|
||||
#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
|
||||
#' max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
|
||||
#' objective = "multi:softprob", num_class = nclass, verbose = 0)
|
||||
#' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
|
||||
#'
|
||||
#' mbst <- xgboost(
|
||||
#' data = x,
|
||||
#' label = as.numeric(iris$Species) - 1,
|
||||
#' nrounds = nrounds,
|
||||
#' max_depth = 2,
|
||||
#' eta = 0.3,
|
||||
#' subsample = 0.5,
|
||||
#' nthread = nthread,
|
||||
#' objective = "multi:softprob",
|
||||
#' num_class = nclass,
|
||||
#' verbose = 0
|
||||
#' )
|
||||
#' trees0 <- seq(from = 0, by = nclass, length.out = nrounds)
|
||||
#' col <- rgb(0, 0, 1, 0.5)
|
||||
#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4,
|
||||
#' n_col = 2, col = col, pch = 16, pch_NA = 17)
|
||||
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
|
||||
#' n_col = 2, col = col, pch = 16, pch_NA = 17)
|
||||
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
|
||||
#' n_col = 2, col = col, pch = 16, pch_NA = 17)
|
||||
#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
|
||||
#' xgb.plot.shap(
|
||||
#' x,
|
||||
#' model = mbst,
|
||||
#' trees = trees0,
|
||||
#' target_class = 0,
|
||||
#' top_n = 4,
|
||||
#' n_col = 2,
|
||||
#' col = col,
|
||||
#' pch = 16,
|
||||
#' pch_NA = 17
|
||||
#' )
|
||||
#'
|
||||
#' xgb.plot.shap(
|
||||
#' x,
|
||||
#' model = mbst,
|
||||
#' trees = trees0 + 1,
|
||||
#' target_class = 1,
|
||||
#' top_n = 4,
|
||||
#' n_col = 2,
|
||||
#' col = col,
|
||||
#' pch = 16,
|
||||
#' pch_NA = 17
|
||||
#' )
|
||||
#'
|
||||
#' xgb.plot.shap(
|
||||
#' x,
|
||||
#' model = mbst,
|
||||
#' trees = trees0 + 2,
|
||||
#' target_class = 2,
|
||||
#' top_n = 4,
|
||||
#' n_col = 2,
|
||||
#' col = col,
|
||||
#' pch = 16,
|
||||
#' pch_NA = 17
|
||||
#' )
|
||||
#'
|
||||
#' # Summary plot
|
||||
#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4)
|
||||
#'
|
||||
#' @rdname xgb.plot.shap
|
||||
#' @export
|
||||
@ -187,41 +242,48 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
|
||||
invisible(list(data = data, shap_contrib = shap_contrib))
|
||||
}
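The margin-scale note above can be checked directly; a minimal sketch (assuming the `bst` fitted in the examples): the row sums of the SHAP contributions, including the BIAS column, reproduce the margin predictions.

contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
margin <- predict(bst, agaricus.test$data, outputmargin = TRUE)
# contributions + bias sum to the log-odds margin (equal up to float tolerance)
all.equal(unname(rowSums(contr)), unname(margin))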

#' SHAP contribution dependency summary plot
#' SHAP summary plot
#'
#' Compare SHAP contributions of different features.
#' Visualizes SHAP contributions of different features.
#'
#' A point plot (each point representing one sample from \code{data}) is
#' A point plot (each point representing one observation from `data`) is
#' produced for each feature, with the points plotted on the SHAP value axis.
#' Each point (observation) is coloured based on its feature value. The plot
#' hence allows us to see which features have a negative / positive contribution
#' Each point (observation) is coloured based on its feature value.
#'
#' The plot allows one to see which features have a negative / positive contribution
#' on the model prediction, and whether the contribution is different for larger
#' or smaller values of the feature. We effectively try to replicate the
#' \code{summary_plot} function from https://github.com/shap/shap.
#' or smaller values of the feature. Inspired by the summary plot of
#' <https://github.com/shap/shap>.
#'
#' @inheritParams xgb.plot.shap
#'
#' @return A \code{ggplot2} object.
#' @return A `ggplot2` object.
#' @export
#'
#' @examples # See \code{\link{xgb.plot.shap}}.
#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
#' \url{https://github.com/shap/shap}
#' @examples
#' # See examples in xgb.plot.shap()
#'
#' @seealso [xgb.plot.shap()], [xgb.ggplot.shap.summary()],
#'   and the Python library <https://github.com/shap/shap>.
xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
                                  trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
  # Only the ggplot implementation is available.
  xgb.ggplot.shap.summary(data, shap_contrib, features, top_n, model, trees, target_class, approxcontrib, subsample)
}
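A hedged usage sketch for this wrapper, reusing `bst` and `agaricus.test` from the xgb.plot.shap() examples:

# Delegates to xgb.ggplot.shap.summary() and returns a ggplot2 object
xgb.plot.shap.summary(agaricus.test$data, model = bst, top_n = 6)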

#' Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
#' Internal utility function.
#' Prepare data for SHAP plots
#'
#' Internal function used in [xgb.plot.shap()], [xgb.plot.shap.summary()], etc.
#'
#' @inheritParams xgb.plot.shap
#' @param max_observations Maximum number of observations to consider.
#' @keywords internal
#' @noRd
#'
#' @return A list containing: 'data', a matrix containing sample observations
#'   and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
#'   values for these observations.
#' @return
#' A list containing:
#' - `data`: The matrix of feature values.
#' - `shap_contrib`: The matrix with corresponding SHAP values.
xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
                          subsample = NULL, max_observations = 100000) {
@ -241,7 +303,11 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
  if (is.character(features) && is.null(colnames(data)))
    stop("either provide `data` with column names or provide `features` as column indices")

  if (is.null(model$feature_names) && model$nfeatures != ncol(data))
  model_feature_names <- NULL
  if (is.null(features) && !is.null(model)) {
    model_feature_names <- xgb.feature_names(model)
  }
  if (is.null(model_feature_names) && xgb.num_feature(model) != ncol(data))
    stop("if model has no feature_names, columns in `data` must match features in model")

  if (!is.null(subsample)) {
@ -270,7 +336,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
  }

  if (is.null(features)) {
    if (!is.null(model$feature_names)) {
    if (!is.null(model_feature_names)) {
      imp <- xgb.importance(model = model, trees = trees)
    } else {
      imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))

@ -1,74 +1,110 @@
#' Plot a boosted tree model
#' Plot boosted trees
#'
#' Read a tree model text dump and plot the model.
#'
#' @param feature_names names of each feature as a \code{character} vector.
#' @param model produced by the \code{xgb.train} function.
#' @param trees an integer vector of tree indices that should be visualized.
#'   If set to \code{NULL}, all trees of the model are included.
#'   IMPORTANT: the tree index in xgboost model is zero-based
#'   (e.g., use \code{trees = 0:2} for the first 3 trees in a model).
#' @param plot_width the width of the diagram in pixels.
#' @param plot_height the height of the diagram in pixels.
#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param feature_names Character vector used to overwrite the feature names
#'   of the model. The default (`NULL`) uses the original feature names.
#' @param model Object of class `xgb.Booster`.
#' @param trees An integer vector of tree indices that should be used.
#'   The default (`NULL`) uses all trees.
#'   Useful, e.g., in multiclass classification to get only
#'   the trees of one class. *Important*: the tree index in XGBoost models
#'   is zero-based (e.g., use `trees = 0:2` for the first three trees).
#' @param plot_width,plot_height Width and height of the graph in pixels.
#'   The values are passed to [DiagrammeR::render_graph()].
#' @param render Should the graph be rendered or not? The default is `TRUE`.
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param style Style to use for the plot. Options are:\itemize{
#'   \item `"xgboost"`: will use the plot style defined in the core XGBoost library,
#'   which is shared between different interfaces through the 'dot' format. This
#'   style was not available before version 2.1.0 in R. It always plots the trees
#'   vertically (from top to bottom).
#'   \item `"R"`: will use the style defined from XGBoost's R interface, which predates
#'   the introduction of the standardized style from the core library. It might plot
#'   the trees horizontally (from left to right).
#' }
#'
#' Note that `style="xgboost"` is only supported when all of the following conditions are met:\itemize{
#'   \item Only a single tree is being plotted.
#'   \item Node IDs are not added to the graph.
#'   \item The graph is being returned as `htmlwidget` (`render=TRUE`).
#' }
#' @param ... currently not used.
#'
#' @details
#'
#' The content of each node is organised that way:
#' When using `style="xgboost"`, the content of each node is visualized as follows:
#' - For non-terminal nodes, it will display the split condition (number or name if
#'   available, and the condition that would decide to which node to go next).
#' - Those nodes will be connected to their children by arrows that indicate whether the
#'   branch corresponds to the condition being met or not being met.
#' - Terminal (leaf) nodes contain the margin to add when ending there.
#'
#' \itemize{
#'   \item Feature name.
#'   \item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
#'   If it is square loss, this simply corresponds to the number of instances seen by a split
#'   or collected by a leaf during training.
#'   The deeper in the tree a node is, the lower this metric will be.
#'   \item \code{Gain} (for split nodes): the information gain metric of a split
#' When using `style="R"`, the content of each node is visualized like this:
#' - *Feature name*.
#' - *Cover:* The sum of second order gradients of training data.
#'   For the squared loss, this simply corresponds to the number of instances in the node.
#'   The deeper in the tree, the lower the value.
#' - *Gain* (for split nodes): Information gain metric of a split
#'   (corresponds to the importance of the node in the model).
#'   \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction.
#' }
#' The tree root nodes also indicate the Tree index (0-based).
#' - *Value* (for leaves): Margin value that the leaf may contribute to the prediction.
#'
#' The tree root nodes also indicate the tree index (0-based).
#'
#' The "Yes" branches are marked by the "< split_value" label.
#' The branches that also used for missing values are marked as bold
#' The branches also used for missing values are marked as bold
#' (as in "carrying extra capacity").
#'
#' This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#' This function uses [GraphViz](https://www.graphviz.org/) as DiagrammeR backend.
#'
#' @return
#'
#' When \code{render = TRUE}:
#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
#'
#' When \code{render = FALSE}:
#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
#' This could be useful if one wants to modify some of the graph attributes
#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
#' The value depends on the `render` parameter:
#' - If `render = TRUE` (default): Rendered graph object which is an htmlwidget of
#'   class `grViz`. Similar to "ggplot" objects, it needs to be printed when not
#'   running from the command line.
#' - If `render = FALSE`: Graph object which is of DiagrammeR's class `dgr_graph`.
#'   This could be useful if one wants to modify some of the graph attributes
#'   before rendering the graph with [DiagrammeR::render_graph()].
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgboost(
#'   data = agaricus.train$data,
#'   label = agaricus.train$label,
#'   max_depth = 3,
#'   eta = 1,
#'   nthread = 2,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#'
#' # plot the first tree, using the style from xgboost's core library
#' # (this plot should look identical to the ones generated from other
#' # interfaces like the python package for xgboost)
#' xgb.plot.tree(model = bst, trees = 1, style = "xgboost")
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' # plot all the trees
#' xgb.plot.tree(model = bst)
#' xgb.plot.tree(model = bst, trees = NULL)
#'
#' # plot only the first tree and display the node ID:
#' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
#'
#' \dontrun{
#' # Below is an example of how to save this plot to a file.
#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
#' # Note that for export_graph() to work, the {DiagrammeRsvg}
#' # and {rsvg} packages must also be installed.
#'
#' library(DiagrammeR)
#' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
#' export_graph(gr, 'tree.pdf', width=1500, height=1900)
#' export_graph(gr, 'tree.png', width=1500, height=1900)
#'
#' gr <- xgb.plot.tree(model = bst, trees = 0:1, render = FALSE)
#' export_graph(gr, "tree.pdf", width = 1500, height = 1900)
#' export_graph(gr, "tree.png", width = 1500, height = 1900)
#' }
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
                          render = TRUE, show_node_id = FALSE, ...) {
                          render = TRUE, show_node_id = FALSE, style = c("R", "xgboost"), ...) {
  check.deprecation(...)
  if (!inherits(model, "xgb.Booster")) {
    stop("model: Has to be an object of class xgb.Booster")
@ -78,9 +114,25 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
    stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
  }

  style <- as.character(head(style, 1L))
  stopifnot(style %in% c("R", "xgboost"))
  if (style == "xgboost") {
    if (NROW(trees) != 1L || !render || show_node_id) {
      stop("style='xgboost' is only supported for single, rendered tree, without node IDs.")
    }
    if (!is.null(feature_names)) {
      stop(
        "style='xgboost' cannot override 'feature_names'. Will automatically take them from the model."
      )
    }

    txt <- xgb.dump(model, dump_format = "dot")
    return(DiagrammeR::grViz(txt[[trees + 1]], width = plot_width, height = plot_height))
  }

  dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)

  dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
  dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Gain)]
  if (show_node_id)
    dt[, label := paste0(ID, ": ", label)]
  dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)]
@ -147,4 +199,4 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))
globalVariables(c("Feature", "ID", "Cover", "Gain", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))

@ -1,12 +1,24 @@
#' Save xgboost model to binary file
#'
#' Save xgboost model to a file in binary format.
#' Save xgboost model to a file in binary or JSON format.
#'
#' @param model model object of \code{xgb.Booster} class.
#' @param fname name of the file to write.
#' @param model Model object of \code{xgb.Booster} class.
#' @param fname Name of the file to write.
#'
#'   Note that the extension of this file name determines the serialization format to use:\itemize{
#'   \item Extension ".ubj" will use the universal binary JSON format (recommended).
#'   This format uses binary types for e.g. floating point numbers, thereby preventing any loss
#'   of precision when converting to a human-readable JSON text or similar.
#'   \item Extension ".json" will use plain JSON, which is a human-readable format.
#'   \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
#'   not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
#'   attribute that boosters might keep, nor feature names or user-specified attributes.
#'   \item If the format is not specified by passing one of the file extensions above, will
#'   default to UBJ.
#'   }
#'
#' @details
#' This methods allows to save a model in an xgboost-internal binary format which is universal
#' This method allows saving a model in an xgboost-internal binary or text format which is universal
#' among the various xgboost interfaces. In R, the saved model file could be read in later
#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
#' of \code{\link{xgb.train}}.
@ -14,13 +26,13 @@
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) might cause compatibility problems in
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
#' how to persist models in a future-proof way, i.e. to make the model accessible in future
#' releases of XGBoost.
#'
#' @seealso
#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.load}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -32,15 +44,17 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(
#'   data = train$data, label = train$label, max_depth = 2, eta = 1,
#' bst <- xgb.train(
#'   data = xgb.DMatrix(train$data, label = train$label),
#'   max_depth = 2,
#'   eta = 1,
#'   nthread = nthread,
#'   nrounds = 2,
#'   objective = "binary:logistic"
#' )
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' if (file.exists('xgb.model')) file.remove('xgb.model')
#' fname <- file.path(tempdir(), "xgb.ubj")
#' xgb.save(bst, fname)
#' bst <- xgb.load(fname)
#' @export
xgb.save <- function(model, fname) {
  if (typeof(fname) != "character")
@ -49,8 +63,7 @@ xgb.save <- function(model, fname) {
    stop("model must be xgb.Booster.",
         if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
  }
  model <- xgb.Booster.complete(model, saveraw = FALSE)
  fname <- path.expand(fname)
  .Call(XGBoosterSaveModel_R, model$handle, enc2utf8(fname[1]))
  .Call(XGBoosterSaveModel_R, xgb.get.handle(model), enc2utf8(fname[1]))
  return(TRUE)
}
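A short sketch of the extension-based format selection documented above (assuming `bst` from the examples; the file names are illustrative):

xgb.save(bst, file.path(tempdir(), "model.ubj"))         # universal binary JSON (recommended)
xgb.save(bst, file.path(tempdir(), "model.json"))        # plain-text JSON
xgb.save(bst, file.path(tempdir(), "model.deprecated"))  # legacy binary format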

@ -11,8 +11,6 @@
#'   \item \code{deprecated}: Encode the booster into old customized binary format.
#' }
#'
#' Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@ -23,14 +21,14 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#'                eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#'                  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic")
#'
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load.raw(raw)
#'
#' @export
xgb.save.raw <- function(model, raw_format = "deprecated") {
xgb.save.raw <- function(model, raw_format = "ubj") {
  handle <- xgb.get.handle(model)
  args <- list(format = raw_format)
  .Call(XGBoosterSaveModelToRaw_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE))
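Sketch of the `raw_format` options under the new default (assumes `bst` as above):

raw_ubj  <- xgb.save.raw(bst)                       # default is now "ubj"
raw_json <- xgb.save.raw(bst, raw_format = "json")  # human-readable JSON bytes
bst2 <- xgb.load.raw(raw_ubj)                       # round-trip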

@ -1,21 +0,0 @@
#' Serialize the booster instance into R's raw vector. The serialization method differs
#' from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
#' parameters. This serialization format is not stable across different xgboost versions.
#'
#' @param booster the booster instance
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' raw <- xgb.serialize(bst)
#' bst <- xgb.unserialize(raw)
#'
#' @export
xgb.serialize <- function(booster) {
  handle <- xgb.get.handle(booster)
  .Call(XGBoosterSerializeToBuffer_R, handle)
}
@ -152,6 +152,10 @@
#'   See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
#'   parameters' values. User can provide either existing or their own callback methods in order
#'   to customize the training process.
#'
#'   Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
#'   are kept as R attributes, and thus do not get saved when using non-R serializers like
#'   \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
#' @param ... other parameters to pass to \code{params}.
#' @param label vector of response values. Should not be provided when data is
#'   a local data file name or an \code{xgb.DMatrix}.
@ -160,6 +164,9 @@
#'   This parameter is only used when input is a dense matrix.
#' @param weight a vector indicating the weight for each row of the input.
#'
#' @return
#' An object of class \code{xgb.Booster}.
#'
#' @details
#' These are the training functions for \code{xgboost}.
#'
@ -201,28 +208,20 @@
#'   \item \code{cb.save.model}: when \code{save_period > 0} is set.
#' }
#'
#' @return
#' An object of class \code{xgb.Booster} with the following elements:
#' \itemize{
#'   \item \code{handle} a handle (pointer) to the xgboost model in memory.
#'   \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
#'   \item \code{niter} number of boosting iterations.
#'   \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#'   first column corresponding to iteration number and the rest corresponding to evaluation
#'   metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
#'   \item \code{call} a function call.
#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#'   capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#'   \item \code{callbacks} callback functions that were either automatically assigned or
#'   explicitly passed.
#'   \item \code{best_iteration} iteration number with the best evaluation metric value
#'   (only available with early stopping).
#'   \item \code{best_score} the best evaluation metric value during early stopping.
#'   (only available with early stopping).
#'   \item \code{feature_names} names of the training dataset features
#'   (only when column names were defined in training data).
#'   \item \code{nfeatures} number of features in training data.
#' }
#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
#' from typical R objects (it's an 'altrep' list class), and it makes a separation between
#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
#' and shared between interfaces through serialization functions like \link{xgb.save}; and
#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
#' not otherwise used by functions like \link{predict.xgb.Booster}.
#'
#' Be aware that one such R attribute that is automatically added is `params` - this attribute
#' is assigned from the `params` argument to this function, and is only meant to serve as a
#' reference for what went into the booster, but is not used in other methods that take a booster
#' object - so for example, changing the booster's configuration requires calling `xgb.config<-`
#' or `xgb.parameters<-`, while simply modifying `attributes(model)$params$<...>` will have no
#' effect elsewhere.
#'
#' @seealso
#' \code{\link{callbacks}},
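A small sketch contrasting the two attribute systems described above (the attribute names are hypothetical; `bst` is any fitted booster):

xgb.attr(bst, "my_note") <- "kept by xgb.save()"  # internal attribute, shared across interfaces
attr(bst, "r_only_note") <- "kept by saveRDS()"   # R attribute, ignored by XGBoost serializers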
@ -251,9 +250,9 @@
#' watchlist <- list(train = dtrain, eval = dtest)
#'
#' ## A simple xgb.train example:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = "binary:logistic", eval_metric = "auc")
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
#'
#' ## An xgb.train example where custom objective and evaluation metric are
#' ## used:
@ -272,13 +271,13 @@
#'
#' # These functions could be used by passing them either:
#' # as 'objective' and 'eval_metric' parameters in the params list:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = logregobj, eval_metric = evalerror)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
#'
#' # or through the ... arguments:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
#'                  objective = logregobj, eval_metric = evalerror)
#'
#' # or as dedicated 'obj' and 'feval' parameters of xgb.train:
@ -287,10 +286,10 @@
#'
#'
#' ## An xgb.train example of using variable learning rates at each iteration:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#'               objective = "binary:logistic", eval_metric = "auc")
#' my_etas <- list(eta = c(0.5, 0.1))
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
#'                  callbacks = list(cb.reset.parameters(my_etas)))
#'
#' ## Early stopping:
@ -371,27 +370,31 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  # The tree updating process would need slightly different handling
  is_update <- NVL(params[['process_type']], '.') == 'update'

  past_evaluation_log <- NULL
  if (inherits(xgb_model, "xgb.Booster")) {
    past_evaluation_log <- attributes(xgb_model)$evaluation_log
  }

  # Construct a booster (either a new one or load from xgb_model)
  handle <- xgb.Booster.handle(
  bst <- xgb.Booster(
    params = params,
    cachelist = append(watchlist, dtrain),
    modelfile = xgb_model,
    handle = NULL
    modelfile = xgb_model
  )
  niter_init <- bst$niter
  bst <- bst$bst
  .Call(
    XGBoosterCopyInfoFromDMatrix_R,
    xgb.get.handle(bst),
    dtrain
  )
  bst <- xgb.handleToBooster(handle = handle, raw = NULL)

  # extract parameters that can affect the relationship b/w #trees and #iterations
  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
  # Note: it might look like these aren't used, but they need to be defined in this
  # environment for the callbacks to work correctly.
  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint

  # When the 'xgb_model' was set, find out how many boosting iterations it has
  niter_init <- 0
  if (!is.null(xgb_model)) {
    niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
    if (length(niter_init) == 0) {
      niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
    }
  }
  if (is_update && nrounds > niter_init)
    stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

@ -405,7 +408,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
    for (f in cb$pre_iter) f()

    xgb.iter.update(
      booster_handle = bst$handle,
      bst = bst,
      dtrain = dtrain,
      iter = iteration - 1,
      obj = obj
@ -413,46 +416,43 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),

    if (length(watchlist) > 0) {
      bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter
        booster_handle = bst$handle,
        bst = bst,
        watchlist = watchlist,
        iter = iteration - 1,
        feval = feval
      )
    }

    xgb.attr(bst$handle, 'niter') <- iteration - 1

    for (f in cb$post_iter) f()

    if (stop_condition) break
  }
  for (f in cb$finalize) f(finalize = TRUE)

  bst <- xgb.Booster.complete(bst, saveraw = TRUE)

  # store the total number of boosting iterations
  bst$niter <- end_iteration

  # store the evaluation results
  if (length(evaluation_log) > 0 &&
      nrow(evaluation_log) > 0) {
  keep_evaluation_log <- FALSE
  if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) {
    keep_evaluation_log <- TRUE
    # include the previous compatible history when available
    if (inherits(xgb_model, 'xgb.Booster') &&
        !is_update &&
        !is.null(xgb_model$evaluation_log) &&
        !is.null(past_evaluation_log) &&
        isTRUE(all.equal(colnames(evaluation_log),
                         colnames(xgb_model$evaluation_log)))) {
      evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
                         colnames(past_evaluation_log)))) {
      evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log))
    }
    bst$evaluation_log <- evaluation_log
  }

  bst$call <- match.call()
  bst$params <- params
  bst$callbacks <- callbacks
  if (!is.null(colnames(dtrain)))
    bst$feature_names <- colnames(dtrain)
  bst$nfeatures <- ncol(dtrain)
  extra_attrs <- list(
    call = match.call(),
    params = params,
    callbacks = callbacks
  )
  if (keep_evaluation_log) {
    extra_attrs$evaluation_log <- evaluation_log
  }
  curr_attrs <- attributes(bst)
  attributes(bst) <- c(curr_attrs, extra_attrs)

  return(bst)
}
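Since the evaluation log now lives in the booster's R attributes rather than in a list element, callers read it back as below (sketch; assumes training with a non-empty `watchlist`):

head(attributes(bst)$evaluation_log)  # replaces the old `bst$evaluation_log`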

@ -1,41 +0,0 @@
#' Load the instance back from \code{\link{xgb.serialize}}
#'
#' @param buffer the buffer containing booster instance saved by \code{\link{xgb.serialize}}
#' @param handle An \code{xgb.Booster.handle} object which will be overwritten with
#'   the new deserialized object. Must be a null handle (e.g. when loading the model through
#'   `readRDS`). If not provided, a new handle will be created.
#' @return An \code{xgb.Booster.handle} object.
#'
#' @export
xgb.unserialize <- function(buffer, handle = NULL) {
  cachelist <- list()
  if (is.null(handle)) {
    handle <- .Call(XGBoosterCreate_R, cachelist)
  } else {
    if (!is.null.handle(handle))
      stop("'handle' is not null/empty. Cannot overwrite existing handle.")
    .Call(XGBoosterCreateInEmptyObj_R, cachelist, handle)
  }
  tryCatch(
    .Call(XGBoosterUnserializeFromBuffer_R, handle, buffer),
    error = function(e) {
      error_msg <- conditionMessage(e)
      m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)",
                   error_msg, perl = TRUE)
      groups <- regmatches(error_msg, m)[[1]]
      if (length(groups) == 3) {
        warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ",
                      "loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ",
                      "function, to ensure that your model can be read in current and upcoming ",
                      "XGBoost releases. Please use xgb.save() instead to preserve models for the ",
                      "long term. For more details and explanation, see ",
                      "https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html",
                      sep = ""))
        .Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
      } else {
        stop(e)
      }
    })
  class(handle) <- "xgb.Booster.handle"
  return(handle)
}
@ -40,10 +40,10 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#' <https://archive.ics.uci.edu/ml/datasets/Mushroom>
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' <http://archive.ics.uci.edu/ml>. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
@ -67,10 +67,10 @@ NULL
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#' <https://archive.ics.uci.edu/ml/datasets/Mushroom>
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' <http://archive.ics.uci.edu/ml>. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
@ -82,7 +82,7 @@ NULL
NULL

# Various imports
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix
#' @importFrom Matrix colSums
#' @importFrom Matrix sparse.model.matrix
#' @importFrom Matrix sparseVector
@ -98,9 +98,12 @@ NULL
#' @importFrom data.table setnames
#' @importFrom jsonlite fromJSON
#' @importFrom jsonlite toJSON
#' @importFrom methods new
#' @importFrom utils object.size str tail
#' @importFrom stats coef
#' @importFrom stats predict
#' @importFrom stats median
#' @importFrom stats variable.names
#' @importFrom utils head
#' @importFrom graphics barplot
#' @importFrom graphics lines

@ -1,5 +1,4 @@
basic_walkthrough       Basic feature walkthrough
caret_wrapper           Use xgboost to train in caret library
custom_objective        Customize loss function, and evaluation metric
boost_from_prediction   Boosting from existing prediction
predict_first_ntree     Predicting using first n trees

@ -1,7 +1,6 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Train a xgboost model from caret library](caret_wrapper.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)

@ -1,44 +0,0 @@
# install development version of caret library that contains xgboost models
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)

# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package
# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent
# and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE)

# Let's add some new categorical features to see if it helps.
# Of course these feature are highly correlated to the Age feature.
# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features,
# even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]

# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old.
# I choose this value based on nothing.
# We will see later if simplifying the information based on arbitrary values is a good strategy
# (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]

#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# train a xgbTree model using caret::train
model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)

# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
# using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)

# See model results
print(model)
@ -81,8 +81,8 @@ output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]

# Following is the same process as other demo
cat("Learning...\n")
bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 9,
               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
bst <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = output_vector), max_depth = 9,
                 eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
print(importance)

@ -74,26 +74,26 @@ cols2ids <- function(object, col_names) {
interaction_list_fid <- cols2ids(interaction_list, colnames(train))

# Fit model with interaction constraints
bst <- xgboost(data = train, label = y, max_depth = 4,
               eta = 0.1, nthread = 2, nrounds = 1000,
               interaction_constraints = interaction_list_fid)
bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                 eta = 0.1, nthread = 2, nrounds = 1000,
                 interaction_constraints = interaction_list_fid)

bst_tree <- xgb.model.dt.tree(colnames(train), bst)
bst_interactions <- treeInteractions(bst_tree, 4)
# interactions constrained to combinations of V1*V2 and V3*V4*V5

# Fit model without interaction constraints
bst2 <- xgboost(data = train, label = y, max_depth = 4,
                eta = 0.1, nthread = 2, nrounds = 1000)
bst2 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000)

bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
bst2_interactions <- treeInteractions(bst2_tree, 4) # many more interactions

# Fit model with both interaction and monotonicity constraints
bst3 <- xgboost(data = train, label = y, max_depth = 4,
                eta = 0.1, nthread = 2, nrounds = 1000,
                interaction_constraints = interaction_list_fid,
                monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))
bst3 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
                  eta = 0.1, nthread = 2, nrounds = 1000,
                  interaction_constraints = interaction_list_fid,
                  monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))

bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
bst3_interactions <- treeInteractions(bst3_tree, 4)

@ -1,6 +1,6 @@
data(mtcars)
head(mtcars)
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
               objective = 'count:poisson', nrounds = 5)
bst <- xgb.train(data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]),
                 objective = 'count:poisson', nrounds = 5)
pred <- predict(bst, as.matrix(mtcars[, -11]))
sqrt(mean((pred - mtcars[, 11]) ^ 2))

@ -27,7 +27,7 @@ head(pred_with_leaf)
create.new.tree.features <- function(model, original.features) {
  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
  cols <- list()
  for (i in 1:model$niter) {
  for (i in 1:xgb.get.num.boosted.rounds(model)) {
    # max is not the real max but it's not important for the purpose of adding features
    leaf.id <- sort(unique(pred_with_leaf[, i]))
    cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id)

@ -9,6 +9,5 @@ demo(create_sparse_matrix, package = 'xgboost')
demo(predict_leaf_indices, package = 'xgboost')
demo(early_stopping, package = 'xgboost')
demo(poisson_regression, package = 'xgboost')
demo(caret_wrapper, package = 'xgboost')
demo(tweedie_regression, package = 'xgboost')
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support

@ -55,7 +55,7 @@ message(sprintf("Creating '%s' from '%s'", OUT_DEF_FILE, IN_DLL_FILE))
}

# use objdump to dump all the symbols
OBJDUMP_FILE <- "objdump-out.txt"
OBJDUMP_FILE <- file.path(tempdir(), "objdump-out.txt")
.pipe_shell_command_to_stdout(
  command = "objdump"
  , args = c("-p", IN_DLL_FILE)

@ -2,16 +2,44 @@
% Please edit documentation in R/utils.R
\name{a-compatibility-note-for-saveRDS-save}
\alias{a-compatibility-note-for-saveRDS-save}
\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.}
\title{Model Serialization and Compatibility}
\description{
It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
the model is to be accessed in the future. If you train a model with the current version of
XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
accessible in later releases of XGBoost. To ensure that your model can be accessed in future
releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
When it comes to serializing XGBoost models, it's possible to use R serializers such as
\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
its own serializers with better compatibility guarantees, which allow loading
said models in other language bindings of XGBoost.

Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{
\item Additional model configuration (accessible through \link{xgb.config}),
which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}.
These are not necessarily useful for prediction/importance/plotting.
\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log}
if present.
}

The first ones (configurations) do not have the same compatibility guarantees as
the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration
might be lost after loading the booster in a different XGBoost version, regardless of the
serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.

The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
not saved when using XGBoost's own serializers. These attributes are only used for informational
purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
These R attributes are only preserved when using R's serializers.

Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models
before version \verb{2.1.0}, have a very different R object structure and are incompatible with
each other. Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before
version \verb{2.1.0} will not work with later \code{xgboost} versions and vice versa. Be aware that
the structure of R model objects could in theory change again in the future, so XGBoost's serializers
should be preferred for long-term storage.

Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
higher of said package, and will have the same compatibility restrictions as R serializers.
}
\details{
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@ -24,26 +52,29 @@ re-construct the corresponding model. To read the model back, use \code{\link{xg
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
as part of another R object.

Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
model but also internal configurations and parameters, and its format is not stable across
multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
control as it relies on R's serialization format (see e.g. the details section in
\link{serialize} and \link{save} from base R).

For more details and explanation about model persistence and archival, consult the page
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
                 max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
                 objective = "binary:logistic")

# Save as a stand-alone file; load it with xgb.load()
xgb.save(bst, 'xgb.model')
bst2 <- xgb.load('xgb.model')
fname <- file.path(tempdir(), "xgb_model.ubj")
xgb.save(bst, fname)
bst2 <- xgb.load(fname)

# Save as a stand-alone file (JSON); load it with xgb.load()
xgb.save(bst, 'xgb.model.json')
bst2 <- xgb.load('xgb.model.json')
if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
fname <- file.path(tempdir(), "xgb_model.json")
xgb.save(bst, fname)
bst2 <- xgb.load(fname)

# Save as a raw byte vector; load it with xgb.load.raw()
xgb_bytes <- xgb.save.raw(bst)
@ -54,11 +85,11 @@ obj <- list(xgb_model_bytes = xgb.save.raw(bst), description = "My first XGBoost
# Persist the R object. Here, saveRDS() is okay, since it doesn't persist
# xgb.Booster directly. What's being persisted is the future-proof byte representation
# as given by xgb.save.raw().
saveRDS(obj, 'my_object.rds')
fname <- file.path(tempdir(), "my_object.Rds")
saveRDS(obj, fname)
# Read back the R object
obj2 <- readRDS('my_object.rds')
obj2 <- readRDS(fname)
# Re-construct xgb.Booster object from the bytes
bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
if (file.exists('my_object.rds')) file.remove('my_object.rds')

}
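A sketch of the trade-off described above (assuming the `bst` from this example): R's serializers keep R-only attributes but are bound to the R object structure, while XGBoost's serializers drop them.

fname_rds <- file.path(tempdir(), "bst.rds")
saveRDS(bst, fname_rds)              # keeps R attributes such as evaluation logs
bst_rds <- readRDS(fname_rds)
attributes(bst_rds)$evaluation_log   # still present, if one was recorded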

@ -19,15 +19,15 @@ UCI Machine Learning Repository.
This data set includes the following fields:

\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
\url{https://archive.ics.uci.edu/ml/datasets/Mushroom}

Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
\url{http://archive.ics.uci.edu/ml}. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

@ -19,15 +19,15 @@ UCI Machine Learning Repository.
This data set includes the following fields:

\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
\url{https://archive.ics.uci.edu/ml/datasets/Mushroom}

Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
\url{http://archive.ics.uci.edu/ml}. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

@ -4,17 +4,22 @@
\alias{cb.save.model}
\title{Callback closure for saving a model file.}
\usage{
cb.save.model(save_period = 0, save_name = "xgboost.model")
cb.save.model(save_period = 0, save_name = "xgboost.ubj")
}
\arguments{
\item{save_period}{save the model to disk after every
\code{save_period} iterations; 0 means save the model at the end.}

\item{save_name}{the name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_%04d.model',
the file saved at iteration 50 would be named "xgboost_0050.model".}

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that the format of the model being saved is determined by the file
extension specified here (see \link{xgb.save} for details about how it works).

It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.ubj',
the file saved at iteration 50 would be named "xgboost_0050.ubj".
}\if{html}{\out{</div>}}}
}
\description{
Callback closure for saving a model file.
@ -29,5 +34,7 @@ Callback function expects the following values to be set in its calling frame:
\code{end_iteration}.
}
\seealso{
\link{xgb.save}

\code{\link{callbacks}}
}
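Sketch of the sprintf-style `save_name` described above (hypothetical `params` and `dtrain`):

bst <- xgb.train(params, dtrain, nrounds = 10,
                 callbacks = list(cb.save.model(save_period = 5,
                                                save_name = "xgboost_%04d.ubj")))
# writes xgboost_0005.ubj and xgboost_0010.ubj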

50
R-package/man/coef.xgb.Booster.Rd
Normal file
@ -0,0 +1,50 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{coef.xgb.Booster}
\alias{coef.xgb.Booster}
\title{Extract coefficients from linear booster}
\usage{
\method{coef}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{A fitted booster of 'gblinear' type.}

\item{...}{Not used.}
}
\value{
The extracted coefficients:\itemize{
\item If there's only one coefficient per column in the data, will be returned as a
vector, potentially containing the feature names if available, with the intercept
as the first entry.
\item If there's more than one coefficient per column in the data (e.g. when using
\code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal
to \verb{[num_features, num_cols]}, with the intercepts as first row. Note that the column
(classes in multi-class classification) dimension will not be named.
}

The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
hence one should get the same values from calling \code{predict(..., outputmargin = TRUE)} and
from performing a matrix multiplication with \code{model.matrix(~., ...)}.

Be aware that the coefficients are obtained by first converting them to strings and
back, so there will always be some very small loss of precision compared to the actual
coefficients as used by \link{predict.xgb.Booster}.
}
\description{
Extracts the coefficients from a 'gblinear' booster object,
as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}.

Note: this function will error out if passing a booster model
which is not of "gblinear" type.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars[, 1]
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
params <- list(booster = "gblinear", nthread = 1)
model <- xgb.train(data = dm, params = params, nrounds = 2)
coef(model)
}
|
||||
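A hedged sketch extending the example above (not part of the diff): the documented agreement between the returned coefficients and untransformed margin predictions can be checked directly; exact equality is not expected because of the string round-trip noted above.

    coefs <- coef(model)
    manual <- model.matrix(~., data = mtcars[, -1]) %*% coefs  # intercept first, as described
    margins <- predict(model, x, outputmargin = TRUE)
    all.equal(as.numeric(manual), margins, tolerance = 1e-5)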
@ -1,36 +1,78 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{getinfo}
% Please edit documentation in R/xgb.Booster.R, R/xgb.DMatrix.R
\name{getinfo.xgb.Booster}
\alias{getinfo.xgb.Booster}
\alias{setinfo.xgb.Booster}
\alias{getinfo}
\alias{getinfo.xgb.DMatrix}
\title{Get information of an xgb.DMatrix object}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Get or set information of xgb.DMatrix and xgb.Booster objects}
\usage{
getinfo(object, ...)
\method{getinfo}{xgb.Booster}(object, name)

\method{getinfo}{xgb.DMatrix}(object, name, ...)
\method{setinfo}{xgb.Booster}(object, name, info)

getinfo(object, name)

\method{getinfo}{xgb.DMatrix}(object, name)

setinfo(object, name, info)

\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix}}

\item{...}{other parameters}
\item{object}{Object of class \code{xgb.DMatrix} or \code{xgb.Booster}.}

\item{name}{the name of the information field to get (see details)}

\item{info}{the specific field of information to set}
}
\value{
For \code{getinfo}, will return the requested field. For \code{setinfo}, will always return value \code{TRUE}
if it succeeds.
}
\description{
Get information of an xgb.DMatrix object
Get or set information of xgb.DMatrix and xgb.Booster objects
}
\details{
The \code{name} field can be one of the following:
The \code{name} field can be one of the following for \code{xgb.DMatrix}:

\itemize{
\item \code{label}: label XGBoost learn from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
\item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
\item \code{label}
\item \code{weight}
\item \code{base_margin}
\item \code{label_lower_bound}
\item \code{label_upper_bound}
\item \code{group}
\item \code{feature_type}
\item \code{feature_name}
\item \code{nrow}
}
See the documentation for \link{xgb.DMatrix} for more information about these fields.

For \code{xgb.Booster}, can be one of the following:
\itemize{
\item \code{feature_type}
\item \code{feature_name}
}

\code{group} can be set by \code{setinfo} but can't be retrieved by \code{getinfo}.
Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
for a DMatrix that had 'qid' assigned.

\bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See
\link{xgb.copy.Booster} for an idea of how this in-place assignment works.

See the documentation for \link{xgb.DMatrix} for possible fields that can be set
(which correspond to arguments in that function).

Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
\item data
\item missing
\item silent
\item nthread
}
}
\examples{
data(agaricus.train, package='xgboost')
@ -41,4 +83,11 @@ setinfo(dtrain, 'label', 1-labels)

labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
}
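A hedged sketch of the \code{xgb.Booster} fields listed above, reusing \code{dtrain} from the example (the replacement names are purely illustrative):

    bst <- xgb.train(data = dtrain, params = list(nthread = 2), nrounds = 2)
    getinfo(bst, "feature_name")   # names inherited from the DMatrix
    # setinfo on a booster modifies it in-place, as noted above
    setinfo(bst, "feature_name", paste0("f", seq_len(ncol(agaricus.train$data))))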
@ -2,8 +2,7 @@
% Please edit documentation in R/xgb.Booster.R
\name{predict.xgb.Booster}
\alias{predict.xgb.Booster}
\alias{predict.xgb.Booster.handle}
\title{Predict method for eXtreme Gradient Boosting model}
\title{Predict method for XGBoost model}
\usage{
\method{predict}{xgb.Booster}(
object,
@ -21,94 +20,88 @@
strict_shape = FALSE,
...
)

\method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}
\item{object}{Object of class \code{xgb.Booster}.}

\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file or \code{xgb.DMatrix}.
\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.
For single-row predictions on sparse data, it is recommended to use the CSR format.
If passing a sparse vector, it will take it as a row vector.}

For single-row predictions on sparse data, it's recommended to use CSR format. If passing
a sparse vector, it will take it as a row vector.}
\item{missing}{Only used when input is a dense matrix. Pick a float value that represents
missing values in data (e.g., 0 or some other extreme value).}

\item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
missing values in data (e.g., sometimes 0 or some other extreme value is used).}

\item{outputmargin}{whether the prediction should be returned in the for of original untransformed
\item{outputmargin}{Whether the prediction should be returned in the form of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}
logistic regression would return log-odds instead of probabilities.}

\item{ntreelimit}{Deprecated, use \code{iterationrange} instead.}

\item{predleaf}{whether predict leaf index.}
\item{predleaf}{Whether to predict per-tree leaf indices.}

\item{predcontrib}{whether to return feature contributions to individual predictions (see Details).}
\item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}

\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
\item{approxcontrib}{Whether to use a fast approximation for feature contributions (see Details).}

\item{predinteraction}{whether to return contributions of feature interactions to individual predictions (see Details).}
\item{predinteraction}{Whether to return contributions of feature interactions to individual predictions (see Details).}

\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
or predinteraction flags is TRUE.}
\item{reshape}{Whether to reshape the vector of predictions to matrix form when there are several
prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib},
or \code{predinteraction} is \code{TRUE}.}

\item{training}{whether is the prediction result used for training. For dart booster,
\item{training}{Whether the predictions are used for training. For dart booster,
training prediction will perform dropout.}

\item{iterationrange}{Specifies which layer of trees are used in prediction. For
example, if a random forest is trained with 100 rounds. Specifying
`iterationrange=(1, 21)`, then only the forests built during [1, 21) (half open set)
rounds are used in this prediction. It's 1-based index just like R vector. When set
to \code{c(1, 1)} XGBoost will use all trees.}
\item{iterationrange}{Specifies which trees are used in prediction. For
example, take a random forest with 100 rounds.
With \code{iterationrange=c(1, 21)}, only the trees built during \verb{[1, 21)} (half open set)
rounds are used in this prediction. The index is 1-based just like an R vector. When set
to \code{c(1, 1)}, XGBoost will use all trees.}

\item{strict_shape}{Default is \code{FALSE}. When it's set to \code{TRUE}, output
type and shape of prediction are invariant to model type.}
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
type and shape of predictions are invariant to the model type.}

\item{...}{Parameters passed to \code{predict.xgb.Booster}}
\item{...}{Not used.}
}
\value{
The return type is different depending whether \code{strict_shape} is set to \code{TRUE}. By default,
for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
the \code{reshape} value.
The return type depends on \code{strict_shape}. If \code{FALSE} (default):
\itemize{
\item For regression or binary classification: A vector of length \code{nrows(newdata)}.
\item For multiclass classification: A vector of length \code{num_class * nrows(newdata)} or
a \verb{(nrows(newdata), num_class)} matrix, depending on the \code{reshape} value.
\item When \code{predleaf = TRUE}: A matrix with one column per tree.
\item When \code{predcontrib = TRUE}: When not multiclass, a matrix with
\code{num_features + 1} columns. The last "+ 1" column corresponds to the baseline value.
In the multiclass case, a list of \code{num_class} such matrices.
The contribution values are on the scale of untransformed margin
(e.g., for binary classification, the values are log-odds deviations from the baseline).
\item When \code{predinteraction = TRUE}: When not multiclass, the output is a 3d array of
dimension \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
elements represent different feature interaction contributions. The array is symmetric WRT the last
two dimensions. The "+ 1" columns correspond to the baseline. Summing this array along the last dimension should
produce practically the same result as \code{predcontrib = TRUE}.
In the multiclass case, a list of \code{num_class} such arrays.
}

When \code{predleaf = TRUE}, the output is a matrix object with the
number of columns corresponding to the number of trees.

When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
\code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
For a multiclass case, a list of \code{num_class} elements is returned, where each element is
such a matrix. The contribution values are on the scale of untransformed margin
(e.g., for binary classification would mean that the contributions are log-odds deviations from bias).

When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
elements represent different features interaction contributions. The array is symmetric WRT the last
two dimensions. The "+ 1" columns corresponds to bias. Summing this array along the last dimension should
produce practically the same result as predict with \code{predcontrib = TRUE}.
For a multiclass case, a list of \code{num_class} elements is returned, where each element is
such an array.

When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
normal prediction, the output is a 2-dimension array \code{(num_class, nrow(newdata))}.

For \code{predcontrib = TRUE}, output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}
For \code{predinteraction = TRUE}, output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}
For \code{predleaf = TRUE}, output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}
When \code{strict_shape = TRUE}, the output is always an array:
\itemize{
\item For normal predictions, the output has dimension \verb{(num_class, nrow(newdata))}.
\item For \code{predcontrib = TRUE}, the dimension is \verb{(ncol(newdata) + 1, num_class, nrow(newdata))}.
\item For \code{predinteraction = TRUE}, the dimension is \verb{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}.
\item For \code{predleaf = TRUE}, the dimension is \verb{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}.
}
}
\description{
Predicted values based on either xgboost model or model handle object.
}
\details{
Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
since gblinear doesn't keep its boosting history.
Note that \code{iterationrange} would currently do nothing for predictions from "gblinear",
since "gblinear" doesn't keep its boosting history.

One possible practical application of the \code{predleaf} option is to use the model
as a generator of new features which capture non-linearity and interactions,
e.g., as implemented in \code{\link{xgb.create.features}}.
e.g., as implemented in \code{\link[=xgb.create.features]{xgb.create.features()}}.

Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
individual predictions. For "gblinear" booster, feature contributions are simply linear terms
@ -124,14 +117,14 @@ Since it quadratically depends on the number of features, it is recommended to p
of the most important features first. See below about the format of the returned results.

The \code{predict()} method uses as many threads as defined in \code{xgb.Booster} object (all by default).
If you want to change their number, then assign a new number to \code{nthread} using \code{\link{xgb.parameters<-}}.
Note also that converting a matrix to \code{\link{xgb.DMatrix}} uses multiple threads too.
If you want to change their number, assign a new number to \code{nthread} using \code{\link[=xgb.parameters<-]{xgb.parameters<-()}}.
Note that converting a matrix to \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} uses multiple threads too.
}
\examples{
## binary classification:

data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

## Keep the number of threads to 2 for examples
nthread <- 2
@ -140,8 +133,15 @@ data.table::setDTthreads(nthread)
train <- agaricus.train
test <- agaricus.test

bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 0.5,
  nthread = nthread,
  nrounds = 5,
  objective = "binary:logistic"
)

# use all trees by default
pred <- predict(bst, test$data)
# use only the 1st tree
@ -173,39 +173,61 @@ par(mar = old_mar)

lb <- as.numeric(iris$Species) - 1
num_class <- 3

set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
objective = "multi:softprob", num_class = num_class)

bst <- xgb.train(
  data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
  max_depth = 4,
  eta = 0.5,
  nthread = 2,
  nrounds = 10,
  subsample = 0.5,
  objective = "multi:softprob",
  num_class = num_class
)

# predict for softmax returns num_class probability numbers per case:
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
# reshape it to a num_class-columns matrix
pred <- matrix(pred, ncol=num_class, byrow=TRUE)
pred <- matrix(pred, ncol = num_class, byrow = TRUE)
# convert the probabilities to softmax labels
pred_labels <- max.col(pred) - 1
# the following should result in the same error as seen in the last iteration
sum(pred_labels != lb)/length(lb)
sum(pred_labels != lb) / length(lb)

# compare that to the predictions from softmax:
# compare with predictions from softmax:
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
objective = "multi:softmax", num_class = num_class)

bst <- xgb.train(
  data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
  max_depth = 4,
  eta = 0.5,
  nthread = 2,
  nrounds = 10,
  subsample = 0.5,
  objective = "multi:softmax",
  num_class = num_class
)

pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
sum(pred5 != lb)/length(lb)
pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
sum(pred5 != lb) / length(lb)

}
\references{
Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}

Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
\enumerate{
\item Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
\item Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles",
\url{https://arxiv.org/abs/1706.06060}
}
}
\seealso{
\code{\link{xgb.train}}.
\code{\link[=xgb.train]{xgb.train()}}
}
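A short follow-up sketch (illustrative, not part of the diff), assuming the binary-classification \code{bst} and \code{test} objects from the first example above: per-row \code{predcontrib} values, including the baseline column, should sum to the untransformed margin.

    contrib <- predict(bst, test$data, predcontrib = TRUE)
    margin <- predict(bst, test$data, outputmargin = TRUE)
    all.equal(unname(rowSums(contrib)), margin, tolerance = 1e-5)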
@ -4,26 +4,35 @@
\alias{print.xgb.Booster}
\title{Print xgb.Booster}
\usage{
\method{print}{xgb.Booster}(x, verbose = FALSE, ...)
\method{print}{xgb.Booster}(x, ...)
}
\arguments{
\item{x}{an xgb.Booster object}
\item{x}{An \code{xgb.Booster} object.}

\item{verbose}{whether to print detailed data (e.g., attribute values)}

\item{...}{not currently used}
\item{...}{Not used.}
}
\value{
The same \code{x} object, returned invisibly
}
\description{
Print information about xgb.Booster.
Print information about \code{xgb.Booster}.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
attr(bst, 'myattr') <- 'memo'

bst <- xgboost(
  data = train$data,
  label = train$label,
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

attr(bst, "myattr") <- "memo"

print(bst)
print(bst, verbose=TRUE)

}
@ -1,42 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{setinfo}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, ...)

\method{setinfo}{xgb.DMatrix}(object, name, info, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{...}{other parameters}

\item{name}{the name of the field to get}

\item{info}{the specific field of information to set}
}
\description{
Set information of an xgb.DMatrix object
}
\details{
The \code{name} field can be one of the following:

\itemize{
\item \code{label}: label XGBoost learn from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
\item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective).
}
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
}
@ -7,17 +7,15 @@
\title{Get a new DMatrix containing the specified rows of
original xgb.DMatrix object}
\usage{
slice(object, ...)
slice(object, idxset)

\method{slice}{xgb.DMatrix}(object, idxset, ...)
\method{slice}{xgb.DMatrix}(object, idxset)

\method{[}{xgb.DMatrix}(object, idxset, colset = NULL)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{...}{other parameters (currently not used)}

\item{idxset}{an integer vector of indices of rows needed}

\item{colset}{currently not used (columns subsetting is not available)}
22
R-package/man/variable.names.xgb.Booster.Rd
Normal file
@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{variable.names.xgb.Booster}
\alias{variable.names.xgb.Booster}
\title{Get Feature Names from Booster}
\usage{
\method{variable.names}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{An \code{xgb.Booster} object.}

\item{...}{Not used.}
}
\description{
Returns the feature / variable / column names from a fitted
booster object, which are set automatically during the call to \link{xgb.train}
from the DMatrix names, or which can be set manually through \link{setinfo}.

If the object doesn't have feature names, will return \code{NULL}.

It is equivalent to calling \code{getinfo(object, "feature_name")}.
}
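A minimal usage sketch of the equivalence stated above (illustrative, not part of the diff):

    library(xgboost)
    data(mtcars)
    dm <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)
    model <- xgb.train(data = dm, params = list(nthread = 1), nrounds = 1)
    identical(variable.names(model), getinfo(model, "feature_name"))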
@ -1,52 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.Booster.complete}
\alias{xgb.Booster.complete}
\title{Restore missing parts of an incomplete xgb.Booster object.}
\usage{
xgb.Booster.complete(object, saveraw = TRUE)
}
\arguments{
\item{object}{object of class \code{xgb.Booster}}

\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
but it has a raw Booster memory dump).
}
\details{
While this method is primarily for internal use, it might be useful in some practical situations.

E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using
\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete} function explicitly once after loading a model as an R-object.
That would prevent further repeated implicit reconstruction of an internal booster model.
}
\examples{

data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
saveRDS(bst, "xgb.model.rds")

# Warning: The resulting RDS file is only compatible with the current XGBoost version.
# Refer to the section titled "a-compatibility-note-for-saveRDS-save".
bst1 <- readRDS("xgb.model.rds")
if (file.exists("xgb.model.rds")) file.remove("xgb.model.rds")
# the handle is invalid:
print(bst1$handle)

bst1 <- xgb.Booster.complete(bst1)
# now the handle points to a valid internal booster model:
print(bst1$handle)

}
@ -6,11 +6,19 @@
\usage{
xgb.DMatrix(
  data,
  info = list(),
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  missing = NA,
  silent = FALSE,
  feature_names = colnames(data),
  nthread = NULL,
  ...
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
  enable_categorical = FALSE
)
}
\arguments{
@ -19,23 +27,65 @@ a \code{dgRMatrix} object,
a \code{dsparseVector} object (only when making predictions from a fitted model, will be
interpreted as a row vector), or a character string representing a filename.}

\item{info}{a named list of additional information to store in the \code{xgb.DMatrix} object.
See \code{\link{setinfo}} for the specific allowed kinds of}
\item{label}{Label of the training data.}

\item{weight}{Weight for each instance.

Note that, for ranking task, weights are per-group. In ranking task, one weight
is assigned to each group (not each data point). This is because we
only care about the relative ordering of data points within each group,
so it doesn't make sense to assign weights to individual data points.}

\item{base_margin}{Base margin used for boosting from existing model.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
}\if{html}{\out{</div>}}}

\item{missing}{a float value that represents missing values in data (used only when input is a dense matrix).
It is useful when a 0 or some other extreme value represents missing values in data.}

\item{silent}{whether to suppress printing an informational message after loading from a file.}

\item{feature_names}{Set names for features. Overrides column names in data
frame and matrix.}

\item{nthread}{Number of threads used for creating DMatrix.}

\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
\item{group}{Group size for all ranking groups.}

\item{qid}{Query ID for data samples, used for ranking.}

\item{label_lower_bound}{Lower bound for survival training.}

\item{label_upper_bound}{Upper bound for survival training.}

\item{feature_weights}{Set feature weights for column sampling.}

\item{enable_categorical}{Experimental support of specializing for categorical features.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
columns of categorical types will automatically
be set to be of categorical type (feature_type='c') in the resulting DMatrix.

If passing 'FALSE' and 'data' is a data frame with categorical columns,
it will result in an error being thrown.

If 'data' is not a data frame, this argument is ignored.

JSON/UBJSON serialization format is required for this.
}\if{html}{\out{</div>}}}
}
\description{
Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
\code{\link{xgb.DMatrix.save}}.
}
\details{
Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
from the original source of data.
}
\examples{
data(agaricus.train, package='xgboost')
## Keep the number of threads to 1 for examples
@ -44,7 +94,7 @@ data.table::setDTthreads(nthread)
dtrain <- with(
  agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
fname <- file.path(tempdir(), "xgb.DMatrix.data")
xgb.DMatrix.save(dtrain, fname)
dtrain <- xgb.DMatrix(fname)
}
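A hedged sketch of the \code{enable_categorical} behavior described above (column names illustrative; the exact \code{feature_type} codes are an assumption):

    df <- data.frame(x1 = c(0.5, 1.5, 2.5), x2 = factor(c("a", "b", "a")))
    dm <- xgb.DMatrix(df, enable_categorical = TRUE, nthread = 1)
    getinfo(dm, "feature_type")   # expected: "q" for numeric, "c" for categorical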
32
R-package/man/xgb.DMatrix.hasinfo.Rd
Normal file
@ -0,0 +1,32 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix.hasinfo}
\alias{xgb.DMatrix.hasinfo}
\title{Check whether DMatrix object has a field}
\usage{
xgb.DMatrix.hasinfo(object, info)
}
\arguments{
\item{object}{The DMatrix object to check for the given \code{info} field.}

\item{info}{The field to check for presence or absence in \code{object}.}
}
\description{
Checks whether an xgb.DMatrix object has a given field assigned to
it, such as weights, labels, etc.
}
\examples{
library(xgboost)
x <- matrix(1:10, nrow = 5)
dm <- xgb.DMatrix(x, nthread = 1)

# 'dm' so far doesn't have any fields set
xgb.DMatrix.hasinfo(dm, "label")

# Fields can be added after construction
setinfo(dm, "label", 1:5)
xgb.DMatrix.hasinfo(dm, "label")
}
\seealso{
\link{xgb.DMatrix}, \link{getinfo.xgb.DMatrix}, \link{setinfo.xgb.DMatrix}
}
@ -17,7 +17,7 @@ Save xgb.DMatrix object to binary file
\examples{
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
fname <- file.path(tempdir(), "xgb.DMatrix.data")
xgb.DMatrix.save(dtrain, fname)
dtrain <- xgb.DMatrix(fname)
}
@ -5,7 +5,7 @@
\alias{xgb.attr<-}
\alias{xgb.attributes}
\alias{xgb.attributes<-}
\title{Accessors for serializable attributes of a model.}
\title{Accessors for serializable attributes of a model}
\usage{
xgb.attr(object, name)

@ -16,64 +16,71 @@ xgb.attributes(object)
xgb.attributes(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}

\item{name}{a non-empty character string specifying which attribute is to be accessed.}
\item{name}{A non-empty character string specifying which attribute is to be accessed.}

\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-}
it's a list (or an object coercible to a list) with the names of attributes to set
\item{value}{For \verb{xgb.attr<-}, a value of an attribute; for \verb{xgb.attributes<-},
it is a list (or an object coercible to a list) with the names of attributes to set
and the elements corresponding to attribute values.
Non-character values are converted to character.
When attribute value is not a scalar, only the first index is used.
When an attribute value is not a scalar, only the first index is used.
Use \code{NULL} to remove an attribute.}
}
\value{
\code{xgb.attr} returns either a string value of an attribute
\itemize{
\item \code{xgb.attr()} returns either a string value of an attribute
or \code{NULL} if an attribute wasn't stored in a model.

\code{xgb.attributes} returns a list of all attribute stored in a model
\item \code{xgb.attributes()} returns a list of all attributes stored in a model
or \code{NULL} if a model has no stored attributes.
}
}
\description{
These methods allow manipulating the key-value attribute strings of an xgboost model.
}
\details{
The primary purpose of xgboost model attributes is to store some meta-data about the model.
The primary purpose of xgboost model attributes is to store some meta data about the model.
Note that they are a separate concept from the object attributes in R.
Specifically, they refer to key-value strings that can be attached to an xgboost model,
stored together with the model's binary representation, and accessed later
(from R or any other interface).
In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
would not be saved by \code{xgb.save} because an xgboost model is an external memory object
In contrast, any R attribute assigned to an R object of \code{xgb.Booster} class
would not be saved by \code{\link[=xgb.save]{xgb.save()}} because an xgboost model is an external memory object
and its serialization is handled externally.
Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
change the value of that parameter for a model.
Use \code{\link{xgb.parameters<-}} to set or change model parameters.
Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters.

The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied.
That would only matter if attributes need to be set many times.
Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
and it would be user's responsibility to call \code{xgb.serialize} to update it.

The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
The \verb{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
but it doesn't delete the other existing attributes.

Important: since this modifies the booster's C object, semantics for assignment here
will differ from R's, as any object reference to the same booster will be modified
too, while assignment of R attributes through \verb{attributes(model)$<attr> <- <value>}
will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
example of these behaviors).
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
bst <- xgboost(
  data = train$data,
  label = train$label,
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

xgb.attr(bst, "my_attribute") <- "my attribute value"
print(xgb.attr(bst, "my_attribute"))
xgb.attributes(bst) <- list(a = 123, b = "abc")

xgb.save(bst, 'xgb.model')
bst1 <- xgb.load('xgb.model')
if (file.exists('xgb.model')) file.remove('xgb.model')
fname <- file.path(tempdir(), "xgb.ubj")
xgb.save(bst, fname)
bst1 <- xgb.load(fname)
print(xgb.attr(bst1, "my_attribute"))
print(xgb.attributes(bst1))
@ -3,31 +3,48 @@
\name{xgb.config}
\alias{xgb.config}
\alias{xgb.config<-}
\title{Accessors for model parameters as JSON string.}
\title{Accessors for model parameters as JSON string}
\usage{
xgb.config(object)

xgb.config(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}

\item{value}{A JSON string.}
\item{value}{An R list.}
}
\value{
\code{xgb.config} will return the parameters as an R list.
}
\description{
Accessors for model parameters as JSON string.
Accessors for model parameters as JSON string
}
\details{
Note that assignment is performed in-place on the booster C object, which, unlike assignment
of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
to the same booster will also get updated.

See \link{xgb.copy.Booster} for an example of this behavior.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
train <- agaricus.train

bst <- xgboost(
data = train$data, label = train$label, max_depth = 2,
eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
  data = train$data,
  label = train$label,
  max_depth = 2,
  eta = 1,
  nthread = nthread,
  nrounds = 2,
  objective = "binary:logistic"
)

config <- xgb.config(bst)

}
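A hedged sketch of the read-modify-write pattern implied above; the exact layout of the configuration list is an assumption and may differ between XGBoost versions:

    config <- xgb.config(bst)
    config$learner$generic_param$nthread <- "1"  # illustrative path into the list
    xgb.config(bst) <- config                    # in-place: all references to 'bst' see it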
53
R-package/man/xgb.copy.Booster.Rd
Normal file
@ -0,0 +1,53 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.copy.Booster}
\alias{xgb.copy.Booster}
\title{Deep-copies a Booster Object}
\usage{
xgb.copy.Booster(model)
}
\arguments{
\item{model}{An 'xgb.Booster' object.}
}
\value{
A deep copy of \code{model} - it will be identical in every way, but C-level
functions called on that copy will not affect the \code{model} variable.
}
\description{
Creates a deep copy of an 'xgb.Booster' object, such that the
C object pointer contained will be a different object, and hence functions
like \link{xgb.attr} will not affect the object from which it was copied.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- mtcars[, -1]
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
  data = dm,
  params = list(nthread = 1),
  nrounds = 3
)

# Set an arbitrary attribute kept at the C level
xgb.attr(model, "my_attr") <- 100
print(xgb.attr(model, "my_attr"))

# Just assigning to a new variable will not create
# a deep copy - C object pointer is shared, and in-place
# modifications will affect both objects
model_shallow_copy <- model
xgb.attr(model_shallow_copy, "my_attr") <- 333
# 'model' was also affected by this change:
print(xgb.attr(model, "my_attr"))

model_deep_copy <- xgb.copy.Booster(model)
xgb.attr(model_deep_copy, "my_attr") <- 444
# 'model' was NOT affected by this change
# (keeps previous value that was assigned before)
print(xgb.attr(model, "my_attr"))

# Verify that the new object was actually modified
print(xgb.attr(model_deep_copy, "my_attr"))
}
@ -48,7 +48,7 @@ be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
correspond to the leaves of the first subtree and last 2 to
those of the second subtree.

[...]
\link{...}

We can understand boosted decision tree
based transformation as a supervised feature encoding that
@ -62,7 +62,7 @@ data(agaricus.test, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))

param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
param <- list(max_depth=2, eta=1, objective='binary:logistic')
nrounds = 4

bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2)
@ -29,22 +29,22 @@ xgb.cv(
}
\arguments{
\item{params}{the list of parameters. The complete list of parameters is
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:squarederror} Regression with squared loss.
\item \code{binary:logistic} logistic regression for classification.
\item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of threads used in training, if not set, all threads are used
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:squarederror} Regression with squared loss.
\item \code{binary:logistic} logistic regression for classification.
\item See \code{\link[=xgb.train]{xgb.train}()} for complete list of objectives.
}
\item \code{eta} step size of each boosting step
\item \code{max_depth} maximum depth of the tree
\item \code{nthread} number of threads used in training, if not set, all threads are used
}

See \code{\link{xgb.train}} for further details.
See also demo/ for walkthrough example in R.}
See \code{\link{xgb.train}} for further details.
See also demo/ for walkthrough example in R.}

\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}

@ -64,17 +64,17 @@ from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callb
\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}

\item{metrics, }{list of evaluation metrics to be used in cross validation,
when it is not specified, the evaluation metric is chosen according to objective function.
Possible options are:
when it is not specified, the evaluation metric is chosen according to objective function.
Possible options are:
\itemize{
\item \code{error} binary classification error rate
\item \code{rmse} Root mean square error
\item \code{logloss} negative log-likelihood function
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item \code{auc} Area under curve
\item \code{aucpr} Area under PR curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
\item \code{error} binary classification error rate
\item \code{rmse} Root mean square error
\item \code{logloss} negative log-likelihood function
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item \code{auc} Area under curve
\item \code{aucpr} Area under PR curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
}}

\item{obj}{customized objective function. Returns gradient and second order
@ -120,26 +120,26 @@ to customize the training process.}
\value{
An object of class \code{xgb.cv.synchronous} with the following elements:
\itemize{
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to the
CV-based evaluation means and standard deviations for the training and test CV-sets.
It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{niter} number of boosting iterations.
\item \code{nfeatures} number of features in training data.
\item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
parameter or randomly generated.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
\item \code{pred} CV prediction values available when \code{prediction} is set.
It is either vector or matrix (see \code{\link{cb.cv.predict}}).
\item \code{models} a list of the CV folds' models. It is only available with the explicit
setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to the
CV-based evaluation means and standard deviations for the training and test CV-sets.
It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{niter} number of boosting iterations.
\item \code{nfeatures} number of features in training data.
\item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
parameter or randomly generated.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
\item \code{pred} CV prediction values available when \code{prediction} is set.
It is either vector or matrix (see \code{\link{cb.cv.predict}}).
\item \code{models} a list of the CV folds' models. It is only available with the explicit
setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
}
}
\description{
@ -9,7 +9,7 @@ xgb.dump(
  fname = NULL,
  fmap = "",
  with_stats = FALSE,
  dump_format = c("text", "json"),
  dump_format = c("text", "json", "dot"),
  ...
)
}
@ -29,7 +29,10 @@ When this option is on, the model dump contains two additional values:
gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.}

\item{dump_format}{either 'text' or 'json' format could be specified.}
\item{dump_format}{either 'text', 'json', or 'dot' (graphviz) format could be specified.

Format 'dot' for a single tree can be passed directly to packages that consume this format
for graph visualization, such as function \code{\link[DiagrammeR:grViz]{DiagrammeR::grViz()}}}

\item{...}{currently not used}
}
@ -57,4 +60,8 @@ print(xgb.dump(bst, with_stats = TRUE))
# print in JSON format:
cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))

# plot first tree leveraging the 'dot' format
if (requireNamespace('DiagrammeR', quietly = TRUE)) {
  DiagrammeR::grViz(xgb.dump(bst, dump_format = "dot")[[1L]])
}
}
@ -8,7 +8,8 @@ xgb.gblinear.history(model, class_index = NULL)
}
\arguments{
\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
using the \code{cb.gblinear.history()} callback.}
using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
loaded from \link{xgb.load} or \link{xgb.load.raw}.}

\item{class_index}{zero-based class index to extract the coefficients for only that
specific class in a multinomial multiclass model. When it is NULL, all the
@ -27,3 +28,11 @@ A helper function to extract the matrix of linear coefficients' history
from a gblinear model created while using the \code{cb.gblinear.history()}
callback.
}
\details{
Note that this is an R-specific function that relies on R attributes that
are not saved when using xgboost's own serialization functions like \link{xgb.load}
or \link{xgb.load.raw}.

In order for a serialized model to be accepted by this function, one must use R
serializers such as \link{saveRDS}.
}
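A hedged sketch of the serialization caveat above, assuming \code{model} is a gblinear booster fitted with the \code{cb.gblinear.history()} callback:

    fname <- file.path(tempdir(), "gblinear.rds")
    saveRDS(model, fname)                      # R serializer: keeps the R attributes
    model2 <- readRDS(fname)
    coef_path <- xgb.gblinear.history(model2)  # still works; an xgb.load()-ed model would not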
19
R-package/man/xgb.get.DMatrix.data.Rd
Normal file
@ -0,0 +1,19 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.get.DMatrix.data}
\alias{xgb.get.DMatrix.data}
\title{Get DMatrix Data}
\usage{
xgb.get.DMatrix.data(dmat)
}
\arguments{
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
}
\value{
The data held in the DMatrix, as a sparse CSR matrix (class \code{dgRMatrix}
from package \code{Matrix}). If it had feature names, these will be added as column names
in the output.
}
\description{
Get DMatrix Data
}
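A minimal usage sketch (illustrative, not part of the diff):

    library(xgboost)
    data(mtcars)
    dm <- xgb.DMatrix(as.matrix(mtcars[, -1]), label = mtcars$mpg, nthread = 1)
    csr <- xgb.get.DMatrix.data(dm)
    class(csr)      # "dgRMatrix"
    colnames(csr)   # feature names carried over from the matrix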
17
R-package/man/xgb.get.DMatrix.num.non.missing.Rd
Normal file
@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.get.DMatrix.num.non.missing}
\alias{xgb.get.DMatrix.num.non.missing}
\title{Get Number of Non-Missing Entries in DMatrix}
\usage{
xgb.get.DMatrix.num.non.missing(dmat)
}
\arguments{
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
}
\value{
The number of non-missing entries in the DMatrix
}
\description{
Get Number of Non-Missing Entries in DMatrix
}
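A minimal usage sketch (illustrative, not part of the diff):

    library(xgboost)
    x <- matrix(c(1, 2, NA, 4), nrow = 2)
    dm <- xgb.DMatrix(x, nthread = 1)
    xgb.get.DMatrix.num.non.missing(dm)   # expected: 3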
58
R-package/man/xgb.get.DMatrix.qcut.Rd
Normal file
@ -0,0 +1,58 @@
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.get.DMatrix.qcut}
|
||||
\alias{xgb.get.DMatrix.qcut}
|
||||
\title{Get Quantile Cuts from DMatrix}
|
||||
\usage{
|
||||
xgb.get.DMatrix.qcut(dmat, output = c("list", "arrays"))
|
||||
}
|
||||
\arguments{
|
||||
\item{dmat}{An \code{xgb.DMatrix} object, as returned by \link{xgb.DMatrix}.}
|
||||
|
||||
\item{output}{Output format for the quantile cuts. Possible options are:\itemize{
|
||||
\item \code{"list"} will return the output as a list with one entry per column, where
|
||||
each column will have a numeric vector with the cuts. The list will be named if
|
||||
\code{dmat} has column names assigned to it.
|
||||
\item \code{"arrays"} will return a list with entries \code{indptr} (base-0 indexing) and
|
||||
\code{data}. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
|
||||
\code{indptr[i]+1} to \code{indptr[i+1]}.
|
||||
}}
|
||||
}
|
||||
\value{
|
||||
The quantile cuts, in the format specified by parameter \code{output}.
|
||||
}
|
||||
\description{
|
||||
Get the quantile cuts (a.k.a. borders) from an \code{xgb.DMatrix}
|
||||
that has been quantized for the histogram method (\code{tree_method="hist"}).
|
||||
|
||||
These cuts are used in order to assign observations to bins - i.e. these are ordered
|
||||
boundaries which are used to determine assignment condition \verb{border_low < x < border_high}.
|
||||
As such, the first and last bin will be outside of the range of the data, so as to include
|
||||
all of the observations there.
|
||||
|
||||
If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
|
||||
which will be output in sorted order from lowest to highest.
|
||||
|
||||
Different columns can have different numbers of bins according to their range.
|
||||
}
|
||||
\examples{
|
||||
library(xgboost)
|
||||
data(mtcars)
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
|
||||
# DMatrix is not quantized right away, but will be once a hist model is generated
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
tree_method = "hist",
|
||||
max_bin = 8,
|
||||
nthread = 1
|
||||
),
|
||||
nrounds = 3
|
||||
)
|
||||
|
||||
# Now can get the quantile cuts
|
||||
xgb.get.DMatrix.qcut(dm)
|
||||
}
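To illustrate the "arrays" layout described above, a sketch continuing from the
quantized \code{dm} of this example:

qc <- xgb.get.DMatrix.qcut(dm, output = "arrays")
# Cuts for column i = 1, following the base-0 'indptr' convention:
qc$data[(qc$indptr[1] + 1):qc$indptr[2]]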
22
R-package/man/xgb.get.num.boosted.rounds.Rd
Normal file
@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.get.num.boosted.rounds}
\alias{xgb.get.num.boosted.rounds}
\title{Get number of boosting rounds in a fitted booster}
\usage{
xgb.get.num.boosted.rounds(model)
}
\arguments{
\item{model}{A fitted \code{xgb.Booster} model.}
}
\value{
The number of rounds saved in the model, as an integer.
}
\description{
Get number of boosting rounds in a fitted booster
}
\details{
Note that setting booster parameters related to training
continuation / updates through \link{xgb.parameters<-} will reset the
number of rounds to zero.
}
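A sketch of the reset behaviour described above (assumes a fitted \code{model};
\code{process_type}/\code{updater} are the training-continuation parameters):

xgb.get.num.boosted.rounds(model)  # e.g. 3
xgb.parameters(model) <- list(process_type = "update", updater = "refresh")
xgb.get.num.boosted.rounds(model)  # now 0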
@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
\title{Importance of features in a model.}
\title{Feature importance}
\usage{
xgb.importance(
  feature_names = NULL,
@ -14,88 +14,126 @@ xgb.importance(
)
}
\arguments{
\item{feature_names}{character vector of feature names. If the model already
contains feature names, those would be used when \code{feature_names=NULL} (default value).
Non-null \code{feature_names} could be provided to override those in the model.}
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default is \code{NULL} (use original feature names).}

\item{model}{object of class \code{xgb.Booster}.}
\item{model}{Object of class \code{xgb.Booster}.}

\item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included
into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
\item{trees}{An integer vector of tree indices that should be included
into the importance calculation (only for the "gbtree" booster).
The default (\code{NULL}) parses all trees.
It could be useful, e.g., in multiclass classification to get feature importances
for each class separately. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
for each class separately. \emph{Important}: the tree index in XGBoost models
is zero-based (e.g., use \code{trees = 0:4} for the first five trees).}

\item{data}{deprecated.}
\item{data}{Deprecated.}

\item{label}{deprecated.}
\item{label}{Deprecated.}

\item{target}{deprecated.}
\item{target}{Deprecated.}
}
\value{
For a tree model, a \code{data.table} with the following columns:
A \code{data.table} with the following columns:

For a tree model:
\itemize{
\item \code{Features} names of the features used in the model;
\item \code{Gain} represents fractional contribution of each feature to the model based on
the total gain of this feature's splits. Higher percentage means a more important
predictive feature.
\item \code{Cover} metric of the number of observation related to this feature;
\item \code{Frequency} percentage representing the relative number of times
a feature have been used in trees.
\item \code{Features}: Names of the features used in the model.
\item \code{Gain}: Fractional contribution of each feature to the model based on
the total gain of this feature's splits. Higher percentage means higher importance.
\item \code{Cover}: Metric of the number of observations related to this feature.
\item \code{Frequency}: Percentage of times a feature has been used in trees.
}

A linear model's importance \code{data.table} has the following columns:
For a linear model:
\itemize{
\item \code{Features} names of the features used in the model;
\item \code{Weight} the linear coefficient of this feature;
\item \code{Class} (only for multiclass models) class label.
\item \code{Features}: Names of the features used in the model.
\item \code{Weight}: Linear coefficient of this feature.
\item \code{Class}: Class label (only for multiclass models).
}

If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
index of the features will be used instead. Because the index is extracted from the model dump
the index of the features will be used instead. Because the index is extracted from the model dump
(based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
}
\description{
Creates a \code{data.table} of feature importances in a model.
Creates a \code{data.table} of feature importances.
}
\details{
This function works for both linear and tree models.

For linear models, the importance is the absolute magnitude of linear coefficients.
For that reason, in order to obtain a meaningful ranking by importance for a linear model,
the features need to be on the same scale (which you also would want to do when using either
L1 or L2 regularization).
To obtain a meaningful ranking by importance for linear models, the features need to
be on the same scale (which is also recommended when using L1 or L2 regularization).
}
\examples{

# binomial classification using gbtree:
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
  eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
# binomial classification using "gbtree":
data(agaricus.train, package = "xgboost")

bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

xgb.importance(model = bst)

# binomial classification using gblinear:
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
  eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
# binomial classification using "gblinear":
bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  booster = "gblinear",
  eta = 0.3,
  nthread = 1,
  nrounds = 20,
  objective = "binary:logistic"
)

xgb.importance(model = bst)

# multiclass classification using gbtree:
# multiclass classification using "gbtree":
nclass <- 3
nrounds <- 10
mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
  max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
  objective = "multi:softprob", num_class = nclass)
mbst <- xgboost(
  data = as.matrix(iris[, -5]),
  label = as.numeric(iris$Species) - 1,
  max_depth = 3,
  eta = 0.2,
  nthread = 2,
  nrounds = nrounds,
  objective = "multi:softprob",
  num_class = nclass
)

# all classes clumped together:
xgb.importance(model = mbst)
# inspect importances separately for each class:
xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))

# multiclass classification using gblinear:
mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
  booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
  objective = "multi:softprob", num_class = nclass)
# inspect importances separately for each class:
xgb.importance(
  model = mbst, trees = seq(from = 0, by = nclass, length.out = nrounds)
)
xgb.importance(
  model = mbst, trees = seq(from = 1, by = nclass, length.out = nrounds)
)
xgb.importance(
  model = mbst, trees = seq(from = 2, by = nclass, length.out = nrounds)
)

# multiclass classification using "gblinear":
mbst <- xgboost(
  data = scale(as.matrix(iris[, -5])),
  label = as.numeric(iris$Species) - 1,
  booster = "gblinear",
  eta = 0.2,
  nthread = 1,
  nrounds = 15,
  objective = "multi:softprob",
  num_class = nclass
)

xgb.importance(model = mbst)

}
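Since \code{Gain} is documented above as a fraction of the total gain, a quick
sanity-check sketch (assumes the \code{bst} booster from this example):

imp <- xgb.importance(model = bst)
sum(imp$Gain)  # should be ~1, since 'Gain' is a fraction of the total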
59
R-package/man/xgb.is.same.Booster.Rd
Normal file
@ -0,0 +1,59 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.is.same.Booster}
\alias{xgb.is.same.Booster}
\title{Check if two boosters share the same C object}
\usage{
xgb.is.same.Booster(obj1, obj2)
}
\arguments{
\item{obj1}{Booster model to compare with \code{obj2}.}

\item{obj2}{Booster model to compare with \code{obj1}.}
}
\value{
Either \code{TRUE} or \code{FALSE} according to whether the two boosters share
the underlying C object.
}
\description{
Checks whether two booster objects refer to the same underlying C object.
}
\details{
As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
object, they don't follow typical copy-on-write semantics of other R objects - that is, if
one assigns a booster to a different variable and modifies that new variable through in-place
methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
variable, unlike typical R assignments which would only modify the latter.

This function allows checking whether two booster objects share the same 'externalptr',
regardless of the R attributes that they might have.

In order to duplicate a booster in such a way that the copy wouldn't share the same
'externalptr', one can use function \link{xgb.copy.Booster}.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
  params = list(nthread = 1),
  data = xgb.DMatrix(x, label = y, nthread = 1),
  nrounds = 3
)

model_shallow_copy <- model
xgb.is.same.Booster(model, model_shallow_copy) # same C object

model_deep_copy <- xgb.copy.Booster(model)
xgb.is.same.Booster(model, model_deep_copy) # different C objects

# In-place assignments modify all references,
# but not full/deep copies of the booster
xgb.attr(model_shallow_copy, "my_attr") <- 111
xgb.attr(model, "my_attr") # gets modified
xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
}
\seealso{
\link{xgb.copy.Booster}
}
@ -34,17 +34,19 @@ data.table::setDTthreads(nthread)

train <- agaricus.train
test <- agaricus.test
bst <- xgboost(
  data = train$data, label = train$label, max_depth = 2, eta = 1,
bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = nthread,
  nrounds = 2,
  objective = "binary:logistic"
)

xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
if (file.exists('xgb.model')) file.remove('xgb.model')
fname <- file.path(tempdir(), "xgb.ubj")
xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.save}}
}
@ -4,12 +4,10 @@
\alias{xgb.load.raw}
\title{Load serialised xgboost model from R's raw vector}
\usage{
xgb.load.raw(buffer, as_booster = FALSE)
xgb.load.raw(buffer)
}
\arguments{
\item{buffer}{the buffer returned by xgb.save.raw}

\item{as_booster}{Return the loaded model as xgb.Booster instead of xgb.Booster.handle.}
}
\description{
The user can generate a raw memory buffer by calling \code{xgb.save.raw()}.
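A minimal round-trip sketch (assumes a fitted booster \code{bst}):

raw_bytes <- xgb.save.raw(bst)   # serialize the model to an R raw vector
bst2 <- xgb.load.raw(raw_bytes)  # load it back as an xgb.Booster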
@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Parse a boosted tree model text dump}
\title{Parse model text dump}
\usage{
xgb.model.dt.tree(
  feature_names = NULL,
@ -14,49 +14,48 @@ xgb.model.dt.tree(
)
}
\arguments{
\item{feature_names}{character vector of feature names. If the model already
contains feature names, those would be used when \code{feature_names=NULL} (default value).
Non-null \code{feature_names} could be provided to override those in the model.}
\item{feature_names}{Character vector of feature names. If the model already
contains feature names, those will be used when \code{feature_names=NULL} (default value).

\item{model}{object of class \code{xgb.Booster}}
Note that, if the model already contains feature names, it's \bold{not} possible to override them here.}

\item{text}{\code{character} vector previously generated by the \code{xgb.dump}
function (where parameter \code{with_stats = TRUE} should have been set).
\code{text} takes precedence over \code{model}.}
\item{model}{Object of class \code{xgb.Booster}.}

\item{trees}{an integer vector of tree indices that should be parsed.
If set to \code{NULL}, all trees of the model are parsed.
It could be useful, e.g., in multiclass classification to get only
the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
\item{text}{Character vector previously generated by the function \code{\link[=xgb.dump]{xgb.dump()}}
(called with parameter \code{with_stats = TRUE}). \code{text} takes precedence over \code{model}.}

\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).}
\item{trees}{An integer vector of tree indices that should be used.
The default (\code{NULL}) uses all trees.
Useful, e.g., in multiclass classification to get only
the trees of one class. \emph{Important}: the tree index in XGBoost models
is zero-based (e.g., use \code{trees = 0:4} for the first five trees).}

\item{...}{currently not used.}
\item{use_int_id}{A logical flag indicating whether nodes in columns "Yes", "No", and
"Missing" should be represented as integers (when \code{TRUE}) or as "Tree-Node"
character strings (when \code{FALSE}, default).}

\item{...}{Currently not used.}
}
\value{
A \code{data.table} with detailed information about model trees' nodes.

The columns of the \code{data.table} are:

A \code{data.table} with detailed information about tree nodes. It has the following columns:
\itemize{
\item \code{Tree}: integer ID of a tree in a model (zero-based index)
\item \code{Node}: integer ID of a node in a tree (zero-based index)
\item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
\item \code{Feature}: for a branch node, it's a feature id or name (when available);
for a leaf note, it simply labels it as \code{'Leaf'}
\item \code{Split}: location of the split for a branch node (split condition is always "less than")
\item \code{Yes}: ID of the next node when the split condition is met
\item \code{No}: ID of the next node when the split condition is not met
\item \code{Missing}: ID of the next node when branch value is missing
\item \code{Quality}: either the split gain (change in loss) or the leaf value
\item \code{Cover}: metric related to the number of observation either seen by a split
or collected by a leaf during training.
\item \code{Tree}: integer ID of a tree in a model (zero-based index).
\item \code{Node}: integer ID of a node in a tree (zero-based index).
\item \code{ID}: character identifier of a node in a model (only when \code{use_int_id = FALSE}).
\item \code{Feature}: for a branch node, a feature ID or name (when available);
for a leaf node, it simply labels it as \code{"Leaf"}.
\item \code{Split}: location of the split for a branch node (split condition is always "less than").
\item \code{Yes}: ID of the next node when the split condition is met.
\item \code{No}: ID of the next node when the split condition is not met.
\item \code{Missing}: ID of the next node when the branch value is missing.
\item \code{Gain}: either the split gain (change in loss) or the leaf value.
\item \code{Cover}: metric related to the number of observations either seen by a split
or collected by a leaf during training.
}

When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
When \code{use_int_id = FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
in the "ID" column. When \code{use_int_id = TRUE}, those columns point to node identifiers from
the corresponding trees in the "Node" column.
}
\description{
@ -65,22 +64,31 @@ Parse a boosted tree model text dump into a \code{data.table} structure.
\examples{
# Basic use:

data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")
## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
  eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")

(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 2,
  eta = 1,
  nthread = nthread,
  nrounds = 2,
  objective = "binary:logistic"
)

# This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst))

# How to match feature names of splits that are following a current 'Yes' branch:

merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
merge(
  dt,
  dt[, .(ID, Y.Feature = Feature)], by.x = "Yes", by.y = "ID", all.x = TRUE
)[
  order(Tree, Node)
]

}
@ -2,29 +2,46 @@
% Please edit documentation in R/xgb.Booster.R
\name{xgb.parameters<-}
\alias{xgb.parameters<-}
\title{Accessors for model parameters.}
\title{Accessors for model parameters}
\usage{
xgb.parameters(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place}.}

\item{value}{a list (or an object coercible to a list) with the names of parameters to set
\item{value}{A list (or an object coercible to a list) with the names of parameters to set
and the elements corresponding to parameter values.}
}
\value{
The same booster \code{object}, which gets modified in-place.
}
\description{
Only the setter for xgboost parameters is currently implemented.
}
\details{
Note that the setter would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle would need to be copied.
Just like \link{xgb.attr}, this function will make in-place modifications
on the booster object which do not follow typical R assignment semantics - that is,
all references to the same booster will also be updated, unlike assignment of R
attributes which follow copy-on-write semantics.

See \link{xgb.copy.Booster} for an example of this behavior.

Be aware that setting parameters of a fitted booster related to training continuation / updates
will reset its number of rounds indicator to zero.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
  eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
bst <- xgboost(
  data = train$data,
  label = train$label,
  max_depth = 2,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

xgb.parameters(bst) <- list(eta = 0.1)
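# A sketch of the in-place semantics described in the details: every
# reference to the same booster sees the update.
bst_ref <- bst
xgb.parameters(bst) <- list(eta = 0.5)
# 'bst_ref' now also carries eta = 0.5, since both names point to the
# same underlying C object; use xgb.copy.Booster() for an independent copy.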
@ -3,7 +3,7 @@
\name{xgb.ggplot.deepness}
\alias{xgb.ggplot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\title{Plot model tree depth}
\usage{
xgb.ggplot.deepness(
  model = NULL,
@ -18,66 +18,84 @@ xgb.plot.deepness(
)
}
\arguments{
\item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
or a data.table result of the \code{xgb.model.dt.tree} function.}
\item{model}{Either an \code{xgb.Booster} model, or the "data.table" returned by \code{\link[=xgb.model.dt.tree]{xgb.model.dt.tree()}}.}

\item{which}{which distribution to plot (see details).}
\item{which}{Which distribution to plot (see details).}

\item{plot}{(base R barplot) whether a barplot should be produced.
If FALSE, only a data.table is returned.}
\item{plot}{Should the plot be shown? Default is \code{TRUE}.}

\item{...}{other parameters passed to \code{barplot} or \code{plot}.}
\item{...}{Other parameters passed to \code{\link[graphics:barplot]{graphics::barplot()}} or \code{\link[graphics:plot.default]{graphics::plot()}}.}
}
\value{
Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).

The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
or a single ggplot graph for the other \code{which} options.
The return value of the two functions is as follows:
\itemize{
\item \code{xgb.plot.deepness()}: A "data.table" (invisibly).
Each row corresponds to a terminal leaf in the model and contains information
about the leaf's depth, cover, and weight (used in calculating predictions).
If \code{plot = TRUE}, a plot is shown as well.
\item \code{xgb.ggplot.deepness()}: When \code{which = "2x1"}, a list of two "ggplot" objects,
and a single "ggplot" object otherwise.
}
}
\description{
Visualizes distributions related to depth of tree leafs.
\code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
Visualizes distributions related to the depth of tree leaves.
\itemize{
\item \code{xgb.plot.deepness()} uses base R graphics, while
\item \code{xgb.ggplot.deepness()} uses "ggplot2".
}
}
\details{
When \code{which="2x1"}, two distributions with respect to the leaf depth
When \code{which = "2x1"}, two distributions with respect to the leaf depth
are plotted on top of each other:
\itemize{
\item the distribution of the number of leafs in a tree model at a certain depth;
\item the distribution of average weighted number of observations ("cover")
ending up in leafs at certain depth.
\enumerate{
\item The distribution of the number of leaves in a tree model at a certain depth.
\item The distribution of the average weighted number of observations ("cover")
ending up in leaves at a certain depth.
}

Those could be helpful in determining sensible ranges of the \code{max_depth}
and \code{min_child_weight} parameters.

When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
When \code{which = "max.depth"} or \code{which = "med.depth"}, plots of either maximum or
median depth per tree with respect to the tree number are created.

Finally, \code{which = "med.weight"} allows one to see how
a tree's median absolute leaf weight changes through the iterations.

This function was inspired by the blog post
These functions have been inspired by the blog post
\url{https://github.com/aysent/random-forest-leaf-visualization}.
}
\examples{

data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")
## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)

## Change max_depth to a higher number to get a more significant result
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
  eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
  subsample = 0.5, min_child_weight = 2)
bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 6,
  nthread = nthread,
  nrounds = 50,
  objective = "binary:logistic",
  subsample = 0.5,
  min_child_weight = 2
)

xgb.plot.deepness(bst)
xgb.ggplot.deepness(bst)

xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
xgb.plot.deepness(
  bst, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2
)

xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
xgb.plot.deepness(
  bst, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2
)

}
\seealso{
\code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
\code{\link[=xgb.train]{xgb.train()}} and \code{\link[=xgb.model.dt.tree]{xgb.model.dt.tree()}}.
}
@ -3,7 +3,7 @@
\name{xgb.ggplot.importance}
\alias{xgb.ggplot.importance}
\alias{xgb.plot.importance}
\title{Plot feature importance as a bar graph}
\title{Plot feature importance}
\usage{
xgb.ggplot.importance(
  importance_matrix = NULL,
@ -26,74 +26,90 @@ xgb.plot.importance(
)
}
\arguments{
\item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
\item{importance_matrix}{A \code{data.table} as returned by \code{\link[=xgb.importance]{xgb.importance()}}.}

\item{top_n}{maximal number of top features to include into the plot.}
\item{top_n}{Maximal number of top features to include into the plot.}

\item{measure}{the name of importance measure to plot.
\item{measure}{The name of the importance measure to plot.
When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.}

\item{rel_to_first}{whether importance values should be represented as relative to the highest ranked feature.
See Details.}
\item{rel_to_first}{Whether importance values should be represented as relative to
the highest ranked feature, see Details.}

\item{n_clusters}{(ggplot only) a \code{numeric} vector containing the min and the max range
\item{n_clusters}{A numeric vector containing the min and the max range
of the possible number of clusters of bars.}

\item{...}{other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).}
\item{...}{Other parameters passed to \code{\link[graphics:barplot]{graphics::barplot()}}
(except \code{horiz}, \code{border}, \code{cex.names}, \code{names.arg}, and \code{las}).
Only used in \code{xgb.plot.importance()}.}

\item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.
When it is NULL, the existing \code{par('mar')} is used.}
\item{left_margin}{Adjust the left margin size to fit feature names.
When \code{NULL}, the existing \code{par("mar")} is used.}

\item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.}
\item{cex}{Passed as \code{cex.names} parameter to \code{\link[graphics:barplot]{graphics::barplot()}}.}

\item{plot}{(base R barplot) whether a barplot should be produced.
If FALSE, only a data.table is returned.}
\item{plot}{Should the barplot be shown? Default is \code{TRUE}.}
}
\value{
The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
and silently returns a processed data.table with \code{n_top} features sorted by importance.

The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
The return value depends on the function:
\itemize{
\item \code{xgb.plot.importance()}: Invisibly, a "data.table" with \code{n_top} features sorted
by importance. If \code{plot = TRUE}, the values are also plotted as a barplot.
\item \code{xgb.ggplot.importance()}: A customizable "ggplot" object.
E.g., to change the title, add \code{+ ggtitle("A GRAPH NAME")}.
}
}
\description{
Represents previously calculated feature importance as a bar graph.
\code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
\itemize{
\item \code{xgb.plot.importance()} uses base R graphics, while
\item \code{xgb.ggplot.importance()} uses "ggplot".
}
}
\details{
The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
Features are shown ranked in a decreasing importance order.
It works for importances from both \code{gblinear} and \code{gbtree} models.
Features are sorted by decreasing importance.
It works for both "gblinear" and "gbtree" models.

When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
For gbtree model, that would mean being normalized to the total of 1
When \code{rel_to_first = FALSE}, the values would be plotted as in \code{importance_matrix}.
For a "gbtree" model, that would mean being normalized to the total of 1
("what is feature's importance contribution relative to the whole model?").
For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
Setting \code{rel_to_first = TRUE} allows one to see the picture from the perspective of
"what is feature's importance contribution relative to the most important feature?"

The ggplot-backend method also performs 1-D clustering of the importance values,
with bar colors corresponding to different clusters that have somewhat similar importance values.
The "ggplot" backend performs 1-D clustering of the importance values,
with bar colors corresponding to different clusters having similar importance values.
}
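What \code{rel_to_first = TRUE} amounts to can be reproduced by hand: a sketch
assuming an importance table \code{imp} from \code{xgb.importance()}:

imp$Gain / max(imp$Gain)  # the relative values plotted when rel_to_first = TRUE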
\examples{
data(agaricus.train)

## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)

bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 3,
  eta = 1,
  nthread = nthread,
  nrounds = 2,
  objective = "binary:logistic"
)

importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
xgb.plot.importance(
  importance_matrix, rel_to_first = TRUE, xlab = "Relative importance"
)

xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")

(gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
gg <- xgb.ggplot.importance(
  importance_matrix, measure = "Frequency", rel_to_first = TRUE
)
gg
gg + ggplot2::ylab("Frequency")

}
\seealso{
\code{\link[graphics]{barplot}}.
\code{\link[graphics:barplot]{graphics::barplot()}}
}
@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.plot.multi.trees.R
\name{xgb.plot.multi.trees}
\alias{xgb.plot.multi.trees}
\title{Project all trees on one tree and plot it}
\title{Project all trees on one tree}
\usage{
xgb.plot.multi.trees(
  model,
@ -15,29 +15,31 @@ xgb.plot.multi.trees(
)
}
\arguments{
\item{model}{produced by the \code{xgb.train} function.}
\item{model}{Object of class \code{xgb.Booster}.}

\item{feature_names}{names of each feature as a \code{character} vector.}
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}

\item{features_keep}{number of features to keep in each position of the multi trees.}
\item{features_keep}{Number of features to keep in each position of the multi trees,
by default 5.}

\item{plot_width}{width in pixels of the graph to produce}
\item{plot_width, plot_height}{Width and height of the graph in pixels.
The values are passed to \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.}

\item{plot_height}{height in pixels of the graph to produce}
\item{render}{Should the graph be rendered or not? The default is \code{TRUE}.}

\item{render}{a logical flag for whether the graph should be rendered (see Value).}

\item{...}{currently not used}
\item{...}{Currently not used.}
}
\value{
When \code{render = TRUE}:
returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
Similar to ggplot objects, it needs to be printed to see it when not running from command line.

When \code{render = FALSE}:
silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
The value depends on the \code{render} parameter:
\itemize{
\item If \code{render = TRUE} (default): Rendered graph object which is an htmlwidget of
class \code{grViz}. Similar to "ggplot" objects, it needs to be printed when not
running from the command line.
\item If \code{render = FALSE}: Graph object which is of DiagrammeR's class \code{dgr_graph}.
This could be useful if one wants to modify some of the graph attributes
before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
before rendering the graph with \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.
}
}
\description{
Visualization of the ensemble of trees as a single collective unit.
@ -62,15 +64,22 @@ This function is inspired by this blog post:
}
\examples{

data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

## Keep the number of threads to 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)

bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
  eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
  min_child_weight = 50, verbose = 0
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 15,
  eta = 1,
  nthread = nthread,
  nrounds = 30,
  objective = "binary:logistic",
  min_child_weight = 50,
  verbose = 0
)

p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
@ -78,10 +87,13 @@ print(p)

\dontrun{
# Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
# Note that for export_graph() to work, the {DiagrammeRsvg} and {rsvg} packages
# must also be installed.

library(DiagrammeR)
gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
export_graph(gr, 'tree.pdf', width=1500, height=600)

gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
export_graph(gr, "tree.pdf", width = 1500, height = 600)
}

}
@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.plot.shap}
\alias{xgb.plot.shap}
\title{SHAP contribution dependency plots}
\title{SHAP dependence plots}
\usage{
xgb.plot.shap(
  data,
@ -33,87 +33,93 @@ xgb.plot.shap(
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
\item{data}{The data to explain as a \code{matrix} or \code{dgCMatrix}.}

\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
\item{shap_contrib}{Matrix of SHAP contributions of \code{data}.
The default (\code{NULL}) computes it from \code{model} and \code{data}.}

\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}
\item{features}{Vector of column indices or feature names to plot.
When \code{NULL} (default), the \code{top_n} most important features are selected
by \code{\link[=xgb.importance]{xgb.importance()}}.}

\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
\item{top_n}{How many of the most important features (<= 100) should be selected?
By default, 1 for SHAP dependence and 10 for SHAP summary.
Only used when \code{features = NULL}.}

\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}
\item{model}{An \code{xgb.Booster} model. Only required when \code{shap_contrib = NULL} or
\code{features = NULL}.}

\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
\item{trees}{Passed to \code{\link[=xgb.importance]{xgb.importance()}} when \code{features = NULL}.}

\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}
\item{target_class}{Only relevant for multiclass models. The default (\code{NULL})
averages the SHAP values over all classes. Pass a (0-based) class index
to show only SHAP values of that class.}

\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.}

\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
\item{subsample}{Fraction of data points randomly picked for plotting.
The default (\code{NULL}) will use up to 100k data points.}

\item{n_col}{a number of columns in a grid of plots.}
\item{n_col}{Number of columns in a grid of plots.}

\item{col}{color of the scatterplot markers.}
\item{col}{Color of the scatterplot markers.}

\item{pch}{scatterplot marker.}
\item{pch}{Scatterplot marker.}

\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.}
\item{discrete_n_uniq}{Maximal number of unique feature values to consider the
feature as discrete.}

\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.}
\item{discrete_jitter}{Jitter amount added to the values of discrete features.}

\item{ylab}{a y-axis label in 1D plots.}
\item{ylab}{The y-axis label in 1D plots.}

\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.}
\item{plot_NA}{Should contributions of cases with missing values be plotted?
Default is \code{TRUE}.}

\item{col_NA}{a color of marker for missing value contributions.}
\item{col_NA}{Color of marker for missing value contributions.}

\item{pch_NA}{a marker type for NA values.}
\item{pch_NA}{Marker type for \code{NA} values.}

\item{pos_NA}{a relative position of the x-location where NA values are shown:
\item{pos_NA}{Relative position of the x-location where \code{NA} values are shown:
\code{min(x) + (max(x) - min(x)) * pos_NA}.}

\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with
more than 5 distinct values.}
\item{plot_loess}{Should loess-smoothed curves be plotted? (Default is \code{TRUE}).
The smoothing is only done for features with more than 5 distinct values.}

\item{col_loess}{a color to use for the loess curves.}
\item{col_loess}{Color of loess curves.}

\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.}
\item{span_loess}{The \code{span} parameter of \code{\link[stats:loess]{stats::loess()}}.}

\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.}
\item{which}{Whether to do univariate or bivariate plotting. Currently, only "1d" is implemented.}

\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.}
\item{plot}{Should the plot be drawn? (Default is \code{TRUE}).
If \code{FALSE}, only a list of matrices is returned.}

\item{...}{other parameters passed to \code{plot}.}
\item{...}{Other parameters passed to \code{\link[graphics:plot.default]{graphics::plot()}}.}
}
\value{
In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
In addition to producing plots (when \code{plot = TRUE}), it silently returns a list of two matrices:
\itemize{
\item \code{data} the values of selected features;
\item \code{shap_contrib} the contributions of selected features.
\item \code{data}: Feature value matrix.
\item \code{shap_contrib}: Corresponding SHAP value matrix.
}
}
\description{
Visualizing the SHAP feature contribution to prediction dependencies on feature value.
Visualizes SHAP values against feature values to gain an impression of feature effects.
}
\details{
These scatterplots represent how SHAP feature contributions depend on feature values.
The similarity to partial dependency plots is that they also give an idea for how feature values
affect predictions. However, in partial dependency plots, we usually see marginal dependencies
of model prediction on feature value, while SHAP contribution dependency plots display the estimated
contributions of a feature to model prediction for each individual case.
The similarity to partial dependence plots is that they also give an idea for how feature values
affect predictions. However, in partial dependence plots, we see marginal dependencies
of model prediction on feature value, while SHAP dependence plots display the estimated
contributions of a feature to the prediction for each individual case.

When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
weighted LOESS is computed and plotted, where weights are the numbers of data points
When \code{plot_loess = TRUE}, feature values are rounded to three significant digits and
weighted LOESS is computed and plotted, where the weights are the numbers of data points
at each rounded value.

Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
the margin is prediction before a sigmoidal transform into probability-like values.
Note: SHAP contributions are on the scale of the model margin.
E.g., for a logistic binomial objective, the margin is on log-odds scale.
Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
contributions for all features + bias), depending on the objective used, transforming SHAP
contributions for a feature from the marginal to the prediction space is not necessarily
@ -121,44 +127,99 @@ a meaningful thing to do.
}
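The additivity on the margin scale can be checked directly: a sketch for a
"binary:logistic" booster \code{bst} (such as the one in the example below):

contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
margin <- rowSums(contr)  # SHAP columns plus the BIAS column sum to the margin
prob <- plogis(margin)    # inverse-logit maps the log-odds margin to probabilities
# 'prob' should match predict(bst, agaricus.test$data) up to numerical error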
\examples{

data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
nrounds <- 20

bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
  eta = 0.1, max_depth = 3, subsample = .5,
  method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)
bst <- xgboost(
  agaricus.train$data,
  agaricus.train$label,
  nrounds = nrounds,
  eta = 0.1,
  max_depth = 3,
  subsample = 0.5,
  objective = "binary:logistic",
  nthread = nthread,
  verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")

contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot

# multiclass example - plots for each class separately:
# Summary plot
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)

# Multiclass example - plots for each class separately:
nclass <- 3
x <- as.matrix(iris[, -5])
set.seed(123)
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
  max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
  objective = "multi:softprob", num_class = nclass, verbose = 0)
trees0 <- seq(from=0, by=nclass, length.out=nrounds)

mbst <- xgboost(
  data = x,
  label = as.numeric(iris$Species) - 1,
  nrounds = nrounds,
  max_depth = 2,
  eta = 0.3,
  subsample = 0.5,
  nthread = nthread,
  objective = "multi:softprob",
  num_class = nclass,
  verbose = 0
)
trees0 <- seq(from = 0, by = nclass, length.out = nrounds)
col <- rgb(0, 0, 1, 0.5)
xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4,
  n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
  n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
  n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0,
  target_class = 0,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0 + 1,
  target_class = 1,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0 + 2,
  target_class = 2,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

# Summary plot
xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4)

}
\references{
Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}

Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
\enumerate{
\item Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
\item Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles",
\url{https://arxiv.org/abs/1706.06060}
}
}
@ -3,7 +3,7 @@
|
||||
\name{xgb.ggplot.shap.summary}
|
||||
\alias{xgb.ggplot.shap.summary}
|
||||
\alias{xgb.plot.shap.summary}
|
||||
\title{SHAP contribution dependency summary plot}
|
||||
\title{SHAP summary plot}
|
||||
\usage{
|
||||
xgb.ggplot.shap.summary(
|
||||
data,
|
||||
@ -30,49 +30,54 @@ xgb.plot.shap.summary(
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
|
||||
\item{data}{The data to explain as a \code{matrix} or \code{dgCMatrix}.}
|
||||
|
||||
\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
|
||||
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
|
||||
\item{shap_contrib}{Matrix of SHAP contributions of \code{data}.
|
||||
The default (\code{NULL}) computes it from \code{model} and \code{data}.}
|
||||
|
||||
\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
|
||||
feature importance is calculated, and \code{top_n} high ranked features are taken.}
|
||||
\item{features}{Vector of column indices or feature names to plot.
|
||||
When \code{NULL} (default), the \code{top_n} most important features are selected
|
||||
by \code{\link[=xgb.importance]{xgb.importance()}}.}
|
||||
|
||||
\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
|
||||
\item{top_n}{How many of the most important features (<= 100) should be selected?
|
||||
By default 1 for SHAP dependence and 10 for SHAP summary).
|
||||
Only used when \code{features = NULL}.}
|
||||
|
||||
\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
|
||||
or \code{features} is missing.}
|
||||
\item{model}{An \code{xgb.Booster} model. Only required when \code{shap_contrib = NULL} or
|
||||
\code{features = NULL}.}
|
||||
|
||||
\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
|
||||
\item{trees}{Passed to \code{\link[=xgb.importance]{xgb.importance()}} when \code{features = NULL}.}
|
||||
|
||||
\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
|
||||
only SHAP contributions for that specific class are used.
|
||||
If it is not set, SHAP importances are averaged over all classes.}
|
||||
\item{target_class}{Only relevant for multiclass models. The default (\code{NULL})
|
||||
averages the SHAP values over all classes. Pass a (0-based) class index
|
||||
to show only SHAP values of that class.}
|
||||
|
||||
\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
|
||||
\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.}
|
||||
|
||||
\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
|
||||
it is set so that up to 100K data points are used.}
|
||||
\item{subsample}{Fraction of data points randomly picked for plotting.
|
||||
The default (\code{NULL}) will use up to 100k data points.}
|
||||
}
|
||||
\value{
|
||||
A \code{ggplot2} object.
|
||||
}
|
||||
\description{
|
||||
Compare SHAP contributions of different features.
|
||||
Visualizes SHAP contributions of different features.
|
||||
}
|
||||
\details{
|
||||
A point plot (each point representing one sample from \code{data}) is
|
||||
A point plot (each point representing one observation from \code{data}) is
|
||||
produced for each feature, with the points plotted on the SHAP value axis.
|
||||
Each point (observation) is coloured based on its feature value. The plot
|
||||
hence allows us to see which features have a negative / positive contribution
|
||||
Each point (observation) is coloured based on its feature value.
|
||||
|
||||
The plot allows to see which features have a negative / positive contribution
|
||||
on the model prediction, and whether the contribution is different for larger
or smaller values of the feature. We effectively try to replicate the
\code{summary_plot} function from https://github.com/shap/shap.
or smaller values of the feature. Inspired by the summary plot of
\url{https://github.com/shap/shap}.
}
\examples{
# See \code{\link{xgb.plot.shap}}.
# See examples in xgb.plot.shap()

}
\seealso{
\code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
\url{https://github.com/shap/shap}
\code{\link[=xgb.plot.shap]{xgb.plot.shap()}}, \code{\link[=xgb.ggplot.shap.summary]{xgb.ggplot.shap.summary()}},
and the Python library \url{https://github.com/shap/shap}.
}

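As a quick end-to-end sketch of the summary plot (parameter values are illustrative only; the agaricus data ships with the package):

data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 3, eta = 1, nthread = 2, nrounds = 2,
               objective = "binary:logistic")
# SHAP summary of the 10 most important features (the summary default)
xgb.ggplot.shap.summary(agaricus.train$data, model = bst, top_n = 10)
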
@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.plot.tree.R
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\title{Plot boosted trees}
\usage{
xgb.plot.tree(
  feature_names = NULL,
@ -12,80 +12,124 @@ xgb.plot.tree(
  plot_height = NULL,
  render = TRUE,
  show_node_id = FALSE,
  style = c("R", "xgboost"),
  ...
)
}
\arguments{
\item{feature_names}{names of each feature as a \code{character} vector.}
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}

\item{model}{produced by the \code{xgb.train} function.}
\item{model}{Object of class \code{xgb.Booster}.}

\item{trees}{an integer vector of tree indices that should be visualized.
If set to \code{NULL}, all trees of the model are included.
IMPORTANT: the tree index in xgboost model is zero-based
(e.g., use \code{trees = 0:2} for the first 3 trees in a model).}
\item{trees}{An integer vector of tree indices that should be used.
The default (\code{NULL}) uses all trees.
Useful, e.g., in multiclass classification to get only
the trees of one class. \emph{Important}: the tree index in XGBoost models
is zero-based (e.g., use \code{trees = 0:2} for the first three trees).}

\item{plot_width}{the width of the diagram in pixels.}
\item{plot_width, plot_height}{Width and height of the graph in pixels.
The values are passed to \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.}

\item{plot_height}{the height of the diagram in pixels.}

\item{render}{a logical flag for whether the graph should be rendered (see Value).}
\item{render}{Should the graph be rendered or not? The default is \code{TRUE}.}

\item{show_node_id}{a logical flag for whether to show node id's in the graph.}

\item{style}{Style to use for the plot. Options are:\itemize{
\item \code{"xgboost"}: will use the plot style defined in the core XGBoost library,
which is shared between different interfaces through the 'dot' format. This
style was not available before version 2.1.0 in R. It always plots the trees
vertically (from top to bottom).
\item \code{"R"}: will use the style defined from XGBoost's R interface, which predates
the introduction of the standardized style from the core library. It might plot
the trees horizontally (from left to right).
}

Note that \code{style="xgboost"} is only supported when all of the following conditions are met:\itemize{
\item Only a single tree is being plotted.
\item Node IDs are not added to the graph.
\item The graph is being returned as \code{htmlwidget} (\code{render=TRUE}).
}}

\item{...}{currently not used.}
}
\value{
When \code{render = TRUE}:
returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
Similar to ggplot objects, it needs to be printed to see it when not running from command line.

When \code{render = FALSE}:
silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
The value depends on the \code{render} parameter:
\itemize{
\item If \code{render = TRUE} (default): Rendered graph object which is an htmlwidget of
class \code{grViz}. Similar to "ggplot" objects, it needs to be printed when not
running from the command line.
\item If \code{render = FALSE}: Graph object which is of DiagrammeR's class \code{dgr_graph}.
This could be useful if one wants to modify some of the graph attributes
before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
before rendering the graph with \code{\link[DiagrammeR:render_graph]{DiagrammeR::render_graph()}}.
}
}
\description{
Read a tree model text dump and plot the model.
}
\details{
The content of each node is organised that way:

When using \code{style="xgboost"}, the content of each node is visualized as follows:
\itemize{
\item Feature name.
\item \code{Cover}: The sum of second order gradient of training data classified to the leaf.
If it is square loss, this simply corresponds to the number of instances seen by a split
or collected by a leaf during training.
The deeper in the tree a node is, the lower this metric will be.
\item \code{Gain} (for split nodes): the information gain metric of a split
(corresponds to the importance of the node in the model).
\item \code{Value} (for leaves): the margin value that the leaf may contribute to prediction.
\item For non-terminal nodes, it will display the split condition (number or name if
available, and the condition that would decide to which node to go next).
\item Those nodes will be connected to their children by arrows that indicate whether the
branch corresponds to the condition being met or not being met.
\item Terminal (leaf) nodes contain the margin to add when ending there.
}
The tree root nodes also indicate the Tree index (0-based).

When using \code{style="R"}, the content of each node is visualized like this:
\itemize{
\item \emph{Feature name}.
\item \emph{Cover:} The sum of second order gradients of training data.
For the squared loss, this simply corresponds to the number of instances in the node.
The deeper in the tree, the lower the value.
\item \emph{Gain} (for split nodes): Information gain metric of a split
(corresponds to the importance of the node in the model).
\item \emph{Value} (for leaves): Margin value that the leaf may contribute to the prediction.
}

The tree root nodes also indicate the tree index (0-based).

The "Yes" branches are marked by the "< split_value" label.
The branches that also used for missing values are marked as bold
The branches also used for missing values are marked as bold
(as in "carrying extra capacity").

This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
This function uses \href{https://www.graphviz.org/}{GraphViz} as the DiagrammeR backend.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train, package = "xgboost")

bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  max_depth = 3,
  eta = 1,
  nthread = 2,
  nrounds = 2,
  objective = "binary:logistic"
)

# plot the first tree, using the style from xgboost's core library
# (this plot should look identical to the ones generated from other
# interfaces like the python package for xgboost)
xgb.plot.tree(model = bst, trees = 0, style = "xgboost")

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
# plot all the trees
xgb.plot.tree(model = bst)
xgb.plot.tree(model = bst, trees = NULL)

# plot only the first tree and display the node ID:
xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)

\dontrun{
# Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
# Note that for export_graph() to work, the {DiagrammeRsvg}
# and {rsvg} packages must also be installed.

library(DiagrammeR)
gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
export_graph(gr, 'tree.pdf', width=1500, height=1900)
export_graph(gr, 'tree.png', width=1500, height=1900)

gr <- xgb.plot.tree(model = bst, trees = 0:1, render = FALSE)
export_graph(gr, "tree.pdf", width = 1500, height = 1900)
export_graph(gr, "tree.png", width = 1500, height = 1900)
}

}

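To make the render = FALSE workflow from the Value section concrete, a small sketch (it assumes the bst model from the examples above and that DiagrammeR is installed; the width/height values are illustrative):

library(DiagrammeR)
gr <- xgb.plot.tree(model = bst, trees = 0, render = FALSE)  # a dgr_graph object
# ...optionally adjust graph attributes here, then render explicitly:
render_graph(gr, width = 1200, height = 800)
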
@ -7,15 +7,27 @@
xgb.save(model, fname)
}
\arguments{
\item{model}{model object of \code{xgb.Booster} class.}
\item{model}{Model object of \code{xgb.Booster} class.}

\item{fname}{name of the file to write.}
\item{fname}{Name of the file to write.

Note that the extension of this file name determines the serialization format to use:\itemize{
\item Extension ".ubj" will use the universal binary JSON format (recommended).
This format uses binary types for e.g. floating point numbers, thereby preventing any loss
of precision when converting to a human-readable JSON text or similar.
\item Extension ".json" will use plain JSON, which is a human-readable format.
\item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
attribute that boosters might keep, nor feature names or user-specified attributes.
\item If the format is not specified by passing one of the file extensions above, it will
default to UBJ.
}}
}
\description{
Save xgboost model to a file in binary format.
Save xgboost model to a file in binary or JSON format.
}
\details{
This method allows one to save a model in an xgboost-internal binary format which is universal
This method allows one to save a model in an xgboost-internal binary or text format which is universal
among the various xgboost interfaces. In R, the saved model file can be read in later
using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
of \code{\link{xgb.train}}.
@ -23,7 +35,7 @@ of \code{\link{xgb.train}}.
Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
or \code{\link[base]{save}}). However, it would then only be compatible with R, and
corresponding R-methods would need to be used to load it. Moreover, persisting the model with
\code{\link[base]{readRDS}} or \code{\link[base]{save}} will cause compatibility problems in
\code{\link[base]{readRDS}} or \code{\link[base]{save}} might cause compatibility problems in
future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
how to persist models in a future-proof way, i.e. to make the model accessible in future
releases of XGBoost.
@ -38,16 +50,18 @@ data.table::setDTthreads(nthread)

train <- agaricus.train
test <- agaricus.test
bst <- xgboost(
  data = train$data, label = train$label, max_depth = 2, eta = 1,
bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = nthread,
  nrounds = 2,
  objective = "binary:logistic"
)
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
if (file.exists('xgb.model')) file.remove('xgb.model')
fname <- file.path(tempdir(), "xgb.ubj")
xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.load}}
}

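To make the extension-to-format mapping described above concrete, a short sketch (the file names are illustrative; all three calls assume the bst model from the example):

xgb.save(bst, file.path(tempdir(), "model.ubj"))         # universal binary JSON (recommended)
xgb.save(bst, file.path(tempdir(), "model.json"))        # plain, human-readable JSON
xgb.save(bst, file.path(tempdir(), "model.deprecated"))  # legacy binary format
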
@ -5,19 +5,17 @@
\title{Save xgboost model to R's raw vector,
user can call xgb.load.raw to load the model back from raw vector}
\usage{
xgb.save.raw(model, raw_format = "deprecated")
xgb.save.raw(model, raw_format = "ubj")
}
\arguments{
\item{model}{the model object.}

\item{raw_format}{The format for encoding the booster. Available options are
\itemize{
\item \code{json}: Encode the booster into JSON text document.
\item \code{ubj}: Encode the booster into Universal Binary JSON.
\item \code{deprecated}: Encode the booster into old customized binary format.
}

Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.}
\item \code{json}: Encode the booster into JSON text document.
\item \code{ubj}: Encode the booster into Universal Binary JSON.
\item \code{deprecated}: Encode the booster into old customized binary format.
}}
}
\description{
Save xgboost model from xgboost or xgb.train
@ -32,8 +30,8 @@ data.table::setDTthreads(nthread)

train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
               eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic")
bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
                 eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic")

raw <- xgb.save.raw(bst)
bst <- xgb.load.raw(raw)

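A brief sketch of the raw_format options in action (assuming the bst model from the example; the rawToChar() peek is only meaningful for the JSON encoding):

raw_ubj  <- xgb.save.raw(bst, raw_format = "ubj")   # compact binary JSON
raw_json <- xgb.save.raw(bst, raw_format = "json")  # human-readable JSON text
substr(rawToChar(raw_json), 1, 60)                  # inspect the encoded model
bst2 <- xgb.load.raw(raw_ubj)                       # round-trip back to a booster
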
@ -1,29 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.serialize.R
\name{xgb.serialize}
\alias{xgb.serialize}
\title{Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.}
\usage{
xgb.serialize(booster)
}
\arguments{
\item{booster}{the booster instance}
}
\description{
Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
raw <- xgb.serialize(bst)
bst <- xgb.unserialize(raw)

}
@ -1,55 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.shap.data}
\alias{xgb.shap.data}
\title{Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.}
\usage{
xgb.shap.data(
  data,
  shap_contrib = NULL,
  features = NULL,
  top_n = 1,
  model = NULL,
  trees = NULL,
  target_class = NULL,
  approxcontrib = FALSE,
  subsample = NULL,
  max_observations = 1e+05
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}

\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}

\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}

\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}

\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}

\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}

\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}

\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}

\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
}
\value{
A list containing: 'data', a matrix containing sample observations
and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
values for these observations.
}
\description{
Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.
}
\keyword{internal}
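Since xgb.shap.data() is internal, the user-facing way to obtain the same SHAP matrix is predict() with predcontrib = TRUE; a minimal sketch, assuming the bst booster and agaricus data from the earlier examples:

shap_contrib <- predict(bst, agaricus.train$data, predcontrib = TRUE)
dim(shap_contrib)  # one column per feature, plus a trailing BIAS column
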
@ -43,111 +43,114 @@ xgboost(
}
\arguments{
\item{params}{the list of parameters. The complete list of parameters is
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:

1. General Parameters

\itemize{
\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
is a shorter summary:
\enumerate{
\item General Parameters
}

2. Booster Parameters
\itemize{
\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
}
\enumerate{
\item Booster Parameters
}

2.1. Parameters for Tree Booster

\itemize{
\item{ \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1}
when it is added to the current approximation.
Used to prevent overfitting by making the boosting process more conservative.
Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model
more robust to overfitting but slower to compute. Default: 0.3}
\item{ \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree.
the larger, the more conservative the algorithm will be.}
\item \code{max_depth} maximum depth of a tree. Default: 6
\item{\code{min_child_weight} minimum sum of instance weight (hessian) needed in a child.
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
then the building process will give up further partitioning.
In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node.
The larger, the more conservative the algorithm will be. Default: 1}
\item{ \code{subsample} subsample ratio of the training instance.
Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees
and this will prevent overfitting. It makes computation shorter (because less data to analyse).
It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1}
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{lambda} L2 regularization term on weights. Default: 1
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
\item{ \code{num_parallel_tree} Experimental parameter. number of trees to grow per round.
Useful to test Random Forest through XGBoost
(set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly.
Default: 1}
\item{ \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length
equals to the number of features in the training data.
\code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.}
\item{ \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions.
Each item of the list represents one permitted interaction where specified features are allowed to interact with each other.
Feature index values should start from \code{0} (\code{0} references the first column).
Leave argument unspecified for no interaction constraints.}
\item{ \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1}
when it is added to the current approximation.
Used to prevent overfitting by making the boosting process more conservative.
Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model
more robust to overfitting but slower to compute. Default: 0.3}
\item{ \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree.
the larger, the more conservative the algorithm will be.}
\item \code{max_depth} maximum depth of a tree. Default: 6
\item{\code{min_child_weight} minimum sum of instance weight (hessian) needed in a child.
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
then the building process will give up further partitioning.
In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node.
The larger, the more conservative the algorithm will be. Default: 1}
\item{ \code{subsample} subsample ratio of the training instance.
Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees
and this will prevent overfitting. It makes computation shorter (because less data to analyse).
It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1}
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{lambda} L2 regularization term on weights. Default: 1
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
\item{ \code{num_parallel_tree} Experimental parameter. number of trees to grow per round.
Useful to test Random Forest through XGBoost
(set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly.
Default: 1}
\item{ \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length
equals to the number of features in the training data.
\code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint.}
\item{ \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions.
Each item of the list represents one permitted interaction where specified features are allowed to interact with each other.
Feature index values should start from \code{0} (\code{0} references the first column).
Leave argument unspecified for no interaction constraints.}
}

2.2. Parameters for Linear Booster

\itemize{
\item \code{lambda} L2 regularization term on weights. Default: 0
\item \code{lambda_bias} L2 regularization term on bias. Default: 0
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
\item \code{lambda} L2 regularization term on weights. Default: 0
\item \code{lambda_bias} L2 regularization term on bias. Default: 0
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
}
\enumerate{
\item Task Parameters
}

3. Task Parameters

\itemize{
\item{ \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it.
The default objective options are below:
\itemize{
\item \code{reg:squarederror} Regression with squared loss (Default).
\item{ \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}.
All inputs are required to be greater than -1.
Also, see metric rmsle for possible issue with this objective.}
\item \code{reg:logistic} logistic regression.
\item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
\item{ \code{count:poisson}: Poisson regression for count data, output mean of Poisson distribution.
\code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).}
\item{ \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored).
Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional
hazard function \code{h(t) = h0(t) * HR}.}
\item{ \code{survival:aft}: Accelerated failure time model for censored survival time data. See
\href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time}
for details.}
\item \code{aft_loss_distribution}: Probability Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
\item{ \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective.
Class is represented by a number and should be from 0 to \code{num_class - 1}.}
\item{ \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be
further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging
to each class.}
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
\item{ \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where
\href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.}
\item{ \code{rank:map}: Use LambdaMART to perform list-wise ranking where
\href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)}
is maximized.}
\item{ \code{reg:gamma}: gamma regression with log-link.
Output is a mean of gamma distribution.
It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be
\href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.}
\item{ \code{reg:tweedie}: Tweedie regression with log-link.
It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
\href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.}
}
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item{ \code{eval_metric} evaluation metrics for validation data.
Users can pass a self-defined function to it.
Default: metric will be assigned according to objective
(rmse for regression, and error for classification, mean average precision for ranking).
List is provided in detail section.}
The default objective options are below:
\itemize{
\item \code{reg:squarederror} Regression with squared loss (Default).
\item{ \code{reg:squaredlogerror}: regression with squared log loss \eqn{1/2 * (log(pred + 1) - log(label + 1))^2}.
All inputs are required to be greater than -1.
Also, see metric rmsle for possible issue with this objective.}
\item \code{reg:logistic} logistic regression.
\item \code{reg:pseudohubererror}: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{binary:hinge}: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
\item{ \code{count:poisson}: Poisson regression for count data, output mean of Poisson distribution.
\code{max_delta_step} is set to 0.7 by default in poisson regression (used to safeguard optimization).}
\item{ \code{survival:cox}: Cox regression for right censored survival time data (negative values are considered right censored).
Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional
hazard function \code{h(t) = h0(t) * HR}.}
\item{ \code{survival:aft}: Accelerated failure time model for censored survival time data. See
\href{https://xgboost.readthedocs.io/en/latest/tutorials/aft_survival_analysis.html}{Survival Analysis with Accelerated Failure Time}
for details.}
\item \code{aft_loss_distribution}: Probability Density Function used by \code{survival:aft} and \code{aft-nloglik} metric.
\item{ \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective.
Class is represented by a number and should be from 0 to \code{num_class - 1}.}
\item{ \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be
further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging
to each class.}
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
\item{ \code{rank:ndcg}: Use LambdaMART to perform list-wise ranking where
\href{https://en.wikipedia.org/wiki/Discounted_cumulative_gain}{Normalized Discounted Cumulative Gain (NDCG)} is maximized.}
\item{ \code{rank:map}: Use LambdaMART to perform list-wise ranking where
\href{https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision}{Mean Average Precision (MAP)}
is maximized.}
\item{ \code{reg:gamma}: gamma regression with log-link.
Output is a mean of gamma distribution.
It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be
\href{https://en.wikipedia.org/wiki/Gamma_distribution#Applications}{gamma-distributed}.}
\item{ \code{reg:tweedie}: Tweedie regression with log-link.
It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be
\href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.}
}
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item{ \code{eval_metric} evaluation metrics for validation data.
Users can pass a self-defined function to it.
Default: metric will be assigned according to objective
(rmse for regression, and error for classification, mean average precision for ranking).
List is provided in detail section.}
}}

\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
@ -202,7 +205,12 @@ file with a previously saved model.}
\item{callbacks}{a list of callback functions to perform various tasks during boosting.
See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
parameters' values. User can provide either existing or their own callback methods in order
to customize the training process.}
to customize the training process.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
are kept as R attributes, and thus do not get saved when using non-R serializers like
\link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
}\if{html}{\out{</div>}}}

\item{...}{other parameters to pass to \code{params}.}

@ -216,27 +224,7 @@ This parameter is only used when input is a dense matrix.}
\item{weight}{a vector indicating the weight for each row of the input.}
}

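Tying the parameter summary above together, a typical params list might look like this (all values are illustrative, not recommendations):

param <- list(booster = "gbtree",
              eta = 0.1, max_depth = 6, min_child_weight = 1,
              subsample = 0.8, colsample_bytree = 0.8,
              objective = "binary:logistic", eval_metric = "auc")
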
\value{
An object of class \code{xgb.Booster} with the following elements:
\itemize{
\item \code{handle} a handle (pointer) to the xgboost model in memory.
\item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
\item \code{niter} number of boosting iterations.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to evaluation
metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_score} the best evaluation metric value during early stopping
(only available with early stopping).
\item \code{feature_names} names of the training dataset features
(only when column names were defined in training data).
\item \code{nfeatures} number of features in training data.
}
An object of class \code{xgb.Booster}.
}
\description{
\code{xgb.train} is an advanced interface for training an xgboost model.
@ -258,30 +246,45 @@ when the \code{eval_metric} parameter is not provided.
User may set one or several \code{eval_metric} parameters.
Note that when using a customized metric, only this single metric can be used.
The following is the list of built-in metrics for which XGBoost provides optimized implementation:
\itemize{
\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
\item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
Different threshold (e.g., 0.) could be specified as "error@0."
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item{ \code{auc} Area under the curve.
\url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.}
\item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
}
\itemize{
\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
\item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
Different threshold (e.g., 0.) could be specified as "error@0."
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
\item \code{mae} Mean absolute error
\item \code{mape} Mean absolute percentage error
\item{ \code{auc} Area under the curve.
\url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.}
\item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
}

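For instance, several of the built-in metrics above can be tracked at once by repeating eval_metric in the params list (a sketch; the 0.6 threshold is illustrative):

param <- list(objective = "binary:logistic",
              eval_metric = "auc",
              eval_metric = "error@0.6")  # error rate at a 0.6 threshold
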
The following callbacks are automatically created when certain parameters are set:
\itemize{
\item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
and the \code{print_every_n} parameter is passed to it.
\item \code{cb.evaluation.log} is on when \code{watchlist} is present.
\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
\item \code{cb.save.model}: when \code{save_period > 0} is set.
\item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
and the \code{print_every_n} parameter is passed to it.
\item \code{cb.evaluation.log} is on when \code{watchlist} is present.
\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
\item \code{cb.save.model}: when \code{save_period > 0} is set.
}

Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently
from typical R objects (it's an 'altrep' list class), and it makes a separation between
internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
and shared between interfaces through serialization functions like \link{xgb.save}; and
R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
not used in any way by functions like \link{predict.xgb.Booster}.

Be aware that one such R attribute that is automatically added is \code{params} - this attribute
is assigned from the \code{params} argument to this function, and is only meant to serve as a
reference for what went into the booster, but is not used in other methods that take a booster
object - so for example, changing the booster's configuration requires calling \verb{xgb.config<-}
or \verb{xgb.parameters<-}, while simply modifying \verb{attributes(model)$params$<...>} will have no
effect elsewhere.
}
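A short sketch of the attribute split described above (the attribute names are made up for illustration): xgb.attr() writes into the booster itself and survives xgb.save()/xgb.load(), while attr() only touches the R wrapper and survives R serializers only.

xgb.attr(bst, "trained_by") <- "analyst"  # booster attribute (jsonifyable)
attr(bst, "note") <- "R-side only"        # plain R attribute
xgb.attr(bst, "trained_by")
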
\examples{
data(agaricus.train, package='xgboost')
@ -300,9 +303,9 @@ dtest <- with(
watchlist <- list(train = dtrain, eval = dtest)

## A simple xgb.train example:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
param <- list(max_depth = 2, eta = 1, nthread = nthread,
              objective = "binary:logistic", eval_metric = "auc")
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)

## An xgb.train example where custom objective and evaluation metric are
## used:
@ -321,13 +324,13 @@ evalerror <- function(preds, dtrain) {

# These functions could be used by passing them either:
# as 'objective' and 'eval_metric' parameters in the params list:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
param <- list(max_depth = 2, eta = 1, nthread = nthread,
              objective = logregobj, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)

# or through the ... arguments:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
param <- list(max_depth = 2, eta = 1, nthread = nthread)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                 objective = logregobj, eval_metric = evalerror)

# or as dedicated 'obj' and 'feval' parameters of xgb.train:
@ -336,10 +339,10 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,


## An xgb.train example of using variable learning rates at each iteration:
param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
param <- list(max_depth = 2, eta = 1, nthread = nthread,
              objective = "binary:logistic", eval_metric = "auc")
my_etas <- list(eta = c(0.5, 0.1))
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                 callbacks = list(cb.reset.parameters(my_etas)))

## Early stopping:

@ -1,21 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.unserialize.R
\name{xgb.unserialize}
\alias{xgb.unserialize}
\title{Load the instance back from \code{\link{xgb.serialize}}}
\usage{
xgb.unserialize(buffer, handle = NULL)
}
\arguments{
\item{buffer}{the buffer containing booster instance saved by \code{\link{xgb.serialize}}}

\item{handle}{An \code{xgb.Booster.handle} object which will be overwritten with
the new deserialized object. Must be a null handle (e.g. when loading the model through
`readRDS`). If not provided, a new handle will be created.}
}
\value{
An \code{xgb.Booster.handle} object.
}
\description{
Load the instance back from \code{\link{xgb.serialize}}
}
@ -63,6 +63,7 @@ OBJECTS= \
    $(PKGROOT)/src/gbm/gblinear.o \
    $(PKGROOT)/src/gbm/gblinear_model.o \
    $(PKGROOT)/src/data/adapter.o \
    $(PKGROOT)/src/data/array_interface.o \
    $(PKGROOT)/src/data/simple_dmatrix.o \
    $(PKGROOT)/src/data/data.o \
    $(PKGROOT)/src/data/sparse_page_raw_format.o \

@ -63,6 +63,7 @@ OBJECTS= \
    $(PKGROOT)/src/gbm/gblinear.o \
    $(PKGROOT)/src/gbm/gblinear_model.o \
    $(PKGROOT)/src/data/adapter.o \
    $(PKGROOT)/src/data/array_interface.o \
    $(PKGROOT)/src/data/simple_dmatrix.o \
    $(PKGROOT)/src/data/data.o \
    $(PKGROOT)/src/data/sparse_page_raw_format.o \

@ -15,9 +15,16 @@ Check these declarations against the C/Fortran source code.
*/

/* .Call calls */
extern void XGBInitializeAltrepClass_R(DllInfo *info);
extern SEXP XGDuplicate_R(SEXP);
extern SEXP XGPointerEqComparison_R(SEXP, SEXP);
extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterCreate_R(SEXP);
extern SEXP XGBoosterCreateInEmptyObj_R(SEXP, SEXP);
extern SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP, SEXP);
extern SEXP XGBoosterSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGBoosterBoostedRounds_R(SEXP);
extern SEXP XGBoosterGetNumFeature_R(SEXP);
extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterEvalOneIter_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetAttrNames_R(SEXP);
@ -39,10 +46,15 @@ extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixNumCol_R(SEXP);
extern SEXP XGDMatrixNumRow_R(SEXP);
extern SEXP XGDMatrixGetQuantileCut_R(SEXP);
extern SEXP XGDMatrixNumNonMissing_R(SEXP);
extern SEXP XGDMatrixGetDataAsCSR_R(SEXP);
extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
@ -52,9 +64,15 @@ extern SEXP XGBGetGlobalConfig_R(void);
extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);

static const R_CallMethodDef CallEntries[] = {
  {"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5},
  {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1},
  {"XGPointerEqComparison_R", (DL_FUNC) &XGPointerEqComparison_R, 2},
  {"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5},
  {"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1},
  {"XGBoosterCreateInEmptyObj_R", (DL_FUNC) &XGBoosterCreateInEmptyObj_R, 2},
  {"XGBoosterCopyInfoFromDMatrix_R", (DL_FUNC) &XGBoosterCopyInfoFromDMatrix_R, 2},
  {"XGBoosterSetStrFeatureInfo_R",(DL_FUNC) &XGBoosterSetStrFeatureInfo_R,3}, // NOLINT
  {"XGBoosterGetStrFeatureInfo_R",(DL_FUNC) &XGBoosterGetStrFeatureInfo_R,2}, // NOLINT
  {"XGBoosterBoostedRounds_R", (DL_FUNC) &XGBoosterBoostedRounds_R, 1},
  {"XGBoosterGetNumFeature_R", (DL_FUNC) &XGBoosterGetNumFeature_R, 1},
  {"XGBoosterDumpModel_R", (DL_FUNC) &XGBoosterDumpModel_R, 4},
  {"XGBoosterEvalOneIter_R", (DL_FUNC) &XGBoosterEvalOneIter_R, 4},
  {"XGBoosterGetAttrNames_R", (DL_FUNC) &XGBoosterGetAttrNames_R, 1},
@ -76,10 +94,15 @@ static const R_CallMethodDef CallEntries[] = {
  {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
  {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
  {"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
  {"XGDMatrixGetInfo_R", (DL_FUNC) &XGDMatrixGetInfo_R, 2},
  {"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
  {"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
  {"XGDMatrixCreateFromDF_R", (DL_FUNC) &XGDMatrixCreateFromDF_R, 3},
  {"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
  {"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
  {"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
  {"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1},
  {"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1},
  {"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1},
  {"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3},
  {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3},
  {"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3},
@ -96,4 +119,5 @@ __declspec(dllexport)
void attribute_visible R_init_xgboost(DllInfo *dll) {
  R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
  R_useDynamicSymbols(dll, FALSE);
  XGBInitializeAltrepClass_R(dll);
}

@ -1,5 +1,5 @@
/**
 * Copyright 2014-2023 by XGBoost Contributors
 * Copyright 2014-2024, XGBoost Contributors
 */
#include <dmlc/common.h>
#include <dmlc/omp.h>
@ -8,9 +8,12 @@
#include <xgboost/data.h>
#include <xgboost/logging.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
@ -19,14 +22,14 @@
#include "../../src/c_api/c_api_error.h"
#include "../../src/c_api/c_api_utils.h"  // MakeSparseFromPtr
#include "../../src/common/threading_utils.h"
#include "../../src/data/array_interface.h"  // for ArrayInterface

#include "./xgboost_R.h"  // Must follow other includes.

namespace {

struct ErrorWithUnwind : public std::exception {};

void ThrowExceptionFromRError(void *unused, Rboolean jump) {
void ThrowExceptionFromRError(void *, Rboolean jump) {
  if (jump) {
    throw ErrorWithUnwind();
  }
@ -48,8 +51,35 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
    continuation_token);
}

SEXP WrappedAllocReal(void *void_ptr) {
  size_t *size = static_cast<size_t*>(void_ptr);
  return Rf_allocVector(REALSXP, *size);
}

SEXP SafeAllocReal(size_t size, SEXP continuation_token) {
  return R_UnwindProtect(
    WrappedAllocReal, static_cast<void*>(&size),
    ThrowExceptionFromRError, nullptr,
    continuation_token);
}

SEXP WrappedAllocInteger(void *void_ptr) {
  size_t *size = static_cast<size_t*>(void_ptr);
  return Rf_allocVector(INTSXP, *size);
}

SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
  return R_UnwindProtect(
    WrappedAllocInteger, static_cast<void*>(&size),
    ThrowExceptionFromRError, nullptr,
    continuation_token);
}

[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
  SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
  if (Rf_xlength(mat_dims) > 2) {
    LOG(FATAL) << "Passed input array with more than two dimensions, which is not supported.";
  }
  const int *ptr_mat_dims = INTEGER(mat_dims);

  // Lambda for type dispatch.
@ -132,45 +162,116 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
  jconfig["nthread"] = Rf_asInteger(n_threads);
  return Json::Dump(jconfig);
}

// Allocate an R vector and copy an array interface encoded object to it.
[[nodiscard]] SEXP CopyArrayToR(const char *array_str, SEXP ctoken) {
  xgboost::ArrayInterface<1> array{xgboost::StringView{array_str}};
  // R supports only int and double.
  bool is_int_type =
      xgboost::DispatchDType(array.type, [](auto t) { return std::is_integral_v<decltype(t)>; });
  bool is_float = xgboost::DispatchDType(
      array.type, [](auto v) { return std::is_floating_point_v<decltype(v)>; });
  CHECK(is_int_type || is_float) << "Internal error: Invalid DType.";
  CHECK(array.is_contiguous) << "Internal error: Return by XGBoost should be contiguous";

  // Note: the only case in which this will receive an integer type is
  // for the 'indptr' part of the quantile cut outputs, which comes
  // in sorted order, so the last element contains the maximum value.
  bool fits_into_C_int = xgboost::DispatchDType(array.type, [&](auto t) {
    using T = decltype(t);
    if (!std::is_integral_v<decltype(t)>) {
      return false;
    }
    auto ptr = static_cast<T const *>(array.data);
    T last_elt = ptr[array.n - 1];
    if (last_elt < 0) {
      last_elt = -last_elt;  // no std::abs overload for all possible types
    }
    return last_elt <= std::numeric_limits<int>::max();
  });
  bool use_int = is_int_type && fits_into_C_int;

  // Allocate memory in R
  SEXP out =
      Rf_protect(use_int ? SafeAllocInteger(array.n, ctoken) : SafeAllocReal(array.n, ctoken));

  xgboost::DispatchDType(array.type, [&](auto t) {
    using T = decltype(t);
    auto in_ptr = static_cast<T const *>(array.data);
    if (use_int) {
      auto out_ptr = INTEGER(out);
      std::copy_n(in_ptr, array.n, out_ptr);
    } else {
      auto out_ptr = REAL(out);
      std::copy_n(in_ptr, array.n, out_ptr);
    }
  });

  Rf_unprotect(1);
  return out;
}
}  // namespace

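/* R-level consequence of the integer/double dispatch above (a sketch):
 * anything integral that does not fit into a C int is returned to R as a
 * double, mirroring R's own promotion behaviour, e.g.
 *
 *   typeof(.Machine$integer.max)       # "integer"
 *   typeof(.Machine$integer.max + 1)   # "double" (numeric arithmetic)
 */
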
struct RRNGStateController {
|
||||
RRNGStateController() {
|
||||
GetRNGstate();
|
||||
}
|
||||
|
||||
~RRNGStateController() {
|
||||
PutRNGstate();
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief macro to annotate begin of api
|
||||
*/
|
||||
#define R_API_BEGIN() \
|
||||
GetRNGstate(); \
|
||||
try {
|
||||
try { \
|
||||
RRNGStateController rng_controller{};
|
||||
|
||||
/* Note: an R error triggers a long jump, hence all C++ objects that
|
||||
allocated memory through non-R allocators, including the exception
|
||||
object, need to be destructed before triggering the R error.
|
||||
In order to preserve the error message, it gets copied to a temporary
|
||||
buffer, and the R error section is reached through a 'goto' statement
|
||||
that bypasses usual function control flow. */
|
||||
char cpp_ex_msg[512];
|
||||
/*!
|
||||
* \brief macro to annotate end of api
|
||||
*/
|
||||
#define R_API_END() \
|
||||
} catch(dmlc::Error& e) { \
|
||||
PutRNGstate(); \
|
||||
error(e.what()); \
|
||||
} catch(std::exception &e) { \
|
||||
std::strncpy(cpp_ex_msg, e.what(), 512); \
|
||||
goto throw_cpp_ex_as_R_err; \
|
||||
} \
|
||||
PutRNGstate();
|
||||
if (false) { \
|
||||
throw_cpp_ex_as_R_err: \
|
||||
Rf_error("%s", cpp_ex_msg); \
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief macro to check the call.
|
||||
/**
|
||||
* @brief Macro for checking XGBoost return code.
|
||||
*/
|
||||
#define CHECK_CALL(x) \
|
||||
if ((x) != 0) { \
|
||||
error(XGBGetLastError()); \
|
||||
#define CHECK_CALL(__rc) \
|
||||
if ((__rc) != 0) { \
|
||||
Rf_error("%s", XGBGetLastError()); \
|
||||
}

using dmlc::BeginPtr;

XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
  return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
  return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
}

XGB_DLL void _DMatrixFinalizer(SEXP ext) {
namespace {
void _DMatrixFinalizer(SEXP ext) {
  R_API_BEGIN();
  if (R_ExternalPtrAddr(ext) == NULL) return;
  CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext)));
  R_ClearExternalPtr(ext);
  R_API_END();
}
}  /* namespace */

XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str) {
  R_API_BEGIN();
@ -219,6 +320,69 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
  return ret;
}

XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
  SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();

  DMatrixHandle handle;

  auto make_vec = [&](auto const *ptr, std::int32_t len) {
    auto v = xgboost::linalg::MakeVec(ptr, len);
    return xgboost::linalg::ArrayInterface(v);
  };

  std::int32_t rc{0};
  {
    using xgboost::Json;
    auto n_features = Rf_xlength(df);
    std::vector<Json> array(n_features);
    CHECK_GT(n_features, 0);
    auto len = Rf_xlength(VECTOR_ELT(df, 0));
    // The `data.frame` in R actually converts all data into numeric. The other type
    // handlers here are not used. At the moment they are kept as a reference for when we
    // can avoid making data copies during transformation.
    for (decltype(n_features) i = 0; i < n_features; ++i) {
      switch (TYPEOF(VECTOR_ELT(df, i))) {
        case INTSXP: {
          auto const *ptr = INTEGER(VECTOR_ELT(df, i));
          array[i] = make_vec(ptr, len);
          break;
        }
        case REALSXP: {
          auto const *ptr = REAL(VECTOR_ELT(df, i));
          array[i] = make_vec(ptr, len);
          break;
        }
        case LGLSXP: {
          auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
          array[i] = make_vec(ptr, len);
          break;
        }
        default: {
          LOG(FATAL) << "data.frame has unsupported type.";
        }
      }
    }

    Json jinterface{std::move(array)};
    auto sinterface = Json::Dump(jinterface);
    Json jconfig{xgboost::Object{}};
    jconfig["missing"] = asReal(missing);
    jconfig["nthread"] = asInteger(n_threads);
    auto sconfig = Json::Dump(jconfig);

    rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
  }

  CHECK_CALL(rc);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  Rf_unprotect(1);

  return ret;
}
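
As the comment inside the loop notes, an R data.frame arrives with every column already coerced to a numeric-compatible storage type, and each column is exposed to XGDMatrixCreateFromColumnar as one array-interface object. Roughly, for a single REALSXP column at address p with n rows, the dumped strings look like the following (field names follow the numpy-style array-interface convention this codebase uses; the exact Json::Dump output may differ):

// Illustrative only: approximate columnar interface for one numeric column.
// [{"data": [p, true], "shape": [n], "typestr": "<f8", "version": 3}]
// The accompanying config string carries the remaining construction knobs:
// {"missing": nan, "nthread": 1}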

namespace {
void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
                      std::string *indices_str, std::string *data_str) {
@ -294,6 +458,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
    res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
                                      config.c_str(), &handle);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
@ -342,9 +507,11 @@ XGB_DLL SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
XGB_DLL SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
  R_API_BEGIN();
  SEXP field_ = PROTECT(Rf_asChar(field));
  SEXP arr_dim = Rf_getAttrib(array, R_DimSymbol);
  int res_code;
  {
    const std::string array_str = MakeArrayInterfaceFromRVector(array);
    const std::string array_str = Rf_isNull(arr_dim)?
      MakeArrayInterfaceFromRVector(array) : MakeArrayInterfaceFromRMat(array);
    res_code = XGDMatrixSetInfoFromInterface(
      R_ExternalPtrAddr(handle), CHAR(field_), array_str.c_str());
  }
@ -362,8 +529,14 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) {
  }

  SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len));
  for (size_t i = 0; i < len; ++i) {
    SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
  if (TYPEOF(array) == STRSXP) {
    for (size_t i = 0; i < len; ++i) {
      SET_VECTOR_ELT(str_info_holder, i, STRING_ELT(array, i));
    }
  } else {
    for (size_t i = 0; i < len; ++i) {
      SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
    }
  }

  SEXP field_ = PROTECT(Rf_asChar(field));
@ -407,17 +580,27 @@ XGB_DLL SEXP XGDMatrixGetStrFeatureInfo_R(SEXP handle, SEXP field) {
  return ret;
}

XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
XGB_DLL SEXP XGDMatrixGetFloatInfo_R(SEXP handle, SEXP field) {
  SEXP ret;
  R_API_BEGIN();
  bst_ulong olen;
  const float *res;
  CHECK_CALL(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen, &res));
  ret = PROTECT(allocVector(REALSXP, olen));
  double *ret_ = REAL(ret);
  for (size_t i = 0; i < olen; ++i) {
    ret_[i] = res[i];
  }
  std::copy(res, res + olen, REAL(ret));
  R_API_END();
  UNPROTECT(1);
  return ret;
}

XGB_DLL SEXP XGDMatrixGetUIntInfo_R(SEXP handle, SEXP field) {
  SEXP ret;
  R_API_BEGIN();
  bst_ulong olen;
  const unsigned *res;
  CHECK_CALL(XGDMatrixGetUIntInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen, &res));
  ret = PROTECT(allocVector(INTSXP, olen));
  std::copy(res, res + olen, INTEGER(ret));
  R_API_END();
  UNPROTECT(1);
  return ret;
@ -439,15 +622,210 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
  return ScalarInteger(static_cast<int>(ncol));
}

XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
  return Rf_duplicate(obj);
}

XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2) {
  return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2));
}

XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle) {
  const char *out_names[] = {"indptr", "data", ""};
  SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
  SEXP out = Rf_protect(Rf_mkNamed(VECSXP, out_names));
  R_API_BEGIN();
  const char *out_indptr;
  const char *out_data;
  CHECK_CALL(XGDMatrixGetQuantileCut(R_ExternalPtrAddr(handle), "{}", &out_indptr, &out_data));
  try {
    SET_VECTOR_ELT(out, 0, CopyArrayToR(out_indptr, continuation_token));
    SET_VECTOR_ELT(out, 1, CopyArrayToR(out_data, continuation_token));
  } catch (ErrorWithUnwind &e) {
    R_ContinueUnwind(continuation_token);
  }
  R_API_END();
  Rf_unprotect(2);
  return out;
}

XGB_DLL SEXP XGDMatrixNumNonMissing_R(SEXP handle) {
  SEXP out = Rf_protect(Rf_allocVector(REALSXP, 1));
  R_API_BEGIN();
  bst_ulong out_;
  CHECK_CALL(XGDMatrixNumNonMissing(R_ExternalPtrAddr(handle), &out_));
  REAL(out)[0] = static_cast<double>(out_);
  R_API_END();
  Rf_unprotect(1);
  return out;
}

XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle) {
  const char *out_names[] = {"indptr", "indices", "data", "ncols", ""};
  SEXP out = Rf_protect(Rf_mkNamed(VECSXP, out_names));
  R_API_BEGIN();

  bst_ulong nrows, ncols, nnz;
  CHECK_CALL(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrows));
  CHECK_CALL(XGDMatrixNumCol(R_ExternalPtrAddr(handle), &ncols));
  CHECK_CALL(XGDMatrixNumNonMissing(R_ExternalPtrAddr(handle), &nnz));
  if (std::max(nrows, ncols) > std::numeric_limits<int>::max()) {
    Rf_error("%s", "Error: resulting DMatrix data does not fit into R 'dgRMatrix'.");
  }

  SET_VECTOR_ELT(out, 0, Rf_allocVector(INTSXP, nrows + 1));
  SET_VECTOR_ELT(out, 1, Rf_allocVector(INTSXP, nnz));
  SET_VECTOR_ELT(out, 2, Rf_allocVector(REALSXP, nnz));
  SET_VECTOR_ELT(out, 3, Rf_ScalarInteger(ncols));

  std::unique_ptr<bst_ulong[]> indptr(new bst_ulong[nrows + 1]);
  std::unique_ptr<unsigned[]> indices(new unsigned[nnz]);
  std::unique_ptr<float[]> data(new float[nnz]);

  CHECK_CALL(XGDMatrixGetDataAsCSR(R_ExternalPtrAddr(handle),
                                   "{}",
                                   indptr.get(),
                                   indices.get(),
                                   data.get()));

  std::copy(indptr.get(), indptr.get() + nrows + 1, INTEGER(VECTOR_ELT(out, 0)));
  std::copy(indices.get(), indices.get() + nnz, INTEGER(VECTOR_ELT(out, 1)));
  std::copy(data.get(), data.get() + nnz, REAL(VECTOR_ELT(out, 2)));

  R_API_END();
  Rf_unprotect(1);
  return out;
}

// functions related to booster
void _BoosterFinalizer(SEXP ext) {
  if (R_ExternalPtrAddr(ext) == NULL) return;
  CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(ext)));
  R_ClearExternalPtr(ext);
namespace {
void _BoosterFinalizer(SEXP R_ptr) {
  if (R_ExternalPtrAddr(R_ptr) == NULL) return;
  CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(R_ptr)));
  R_ClearExternalPtr(R_ptr);
}

/* Booster is represented as an altrep list with one element which
   corresponds to an 'externalptr' holding the C object, forbidding
   modification by not implementing setters, and adding custom serialization. */
R_altrep_class_t XGBAltrepPointerClass;

R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) {
  return 1;
}

SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) {
  return R_altrep_data1(R_altrepped_obj);
}

SEXP XGBMakeEmptyAltrep() {
  SEXP class_name = Rf_protect(Rf_mkString("xgb.Booster"));
  SEXP elt_names = Rf_protect(Rf_mkString("ptr"));
  SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  SEXP R_altrepped_obj = Rf_protect(R_new_altrep(XGBAltrepPointerClass, R_ptr, R_NilValue));
  Rf_setAttrib(R_altrepped_obj, R_NamesSymbol, elt_names);
  Rf_setAttrib(R_altrepped_obj, R_ClassSymbol, class_name);
  Rf_unprotect(4);
  return R_altrepped_obj;
}

/* Note: the idea for separating this function from the one above is to be
   able to trigger all R allocations first before doing non-R allocations. */
void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) {
  SEXP R_ptr = R_altrep_data1(R_altrepped_obj);
  R_SetExternalPtrAddr(R_ptr, handle);
  R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE);
}

SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
  R_API_BEGIN();
  BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj));
  char const *serialized_bytes;
  bst_ulong serialized_length;
  CHECK_CALL(XGBoosterSerializeToBuffer(
    handle, &serialized_length, &serialized_bytes));
  SEXP R_state = Rf_protect(Rf_allocVector(RAWSXP, serialized_length));
  if (serialized_length != 0) {
    std::memcpy(RAW(R_state), serialized_bytes, serialized_length);
  }
  Rf_unprotect(1);
  return R_state;
  R_API_END();
  return R_NilValue;  /* <- should not be reached */
}

SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
  SEXP R_altrepped_obj = Rf_protect(XGBMakeEmptyAltrep());
  R_API_BEGIN();
  BoosterHandle handle = nullptr;
  CHECK_CALL(XGBoosterCreate(nullptr, 0, &handle));
  int res_code = XGBoosterUnserializeFromBuffer(handle,
                                                RAW(R_state),
                                                Rf_xlength(R_state));
  if (res_code != 0) {
    XGBoosterFree(handle);
  }
  CHECK_CALL(res_code);
  XGBAltrepSetPointer(R_altrepped_obj, handle);
  R_API_END();
  Rf_unprotect(1);
  return R_altrepped_obj;
}

// https://purrple.cat/blog/2018/10/14/altrep-and-cpp/
Rboolean XGBAltrepInspector_R(
    SEXP x, int pre, int deep, int pvec,
    void (*inspect_subtree)(SEXP, int, int, int)) {
  Rprintf("Altrepped external pointer [address:%p]\n",
          R_ExternalPtrAddr(R_altrep_data1(x)));
  return TRUE;
}

SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
  R_API_BEGIN();
  if (!deep) {
    SEXP out = Rf_protect(XGBMakeEmptyAltrep());
    R_set_altrep_data1(out, R_altrep_data1(R_altrepped_obj));
    Rf_unprotect(1);
    return out;
  } else {
    SEXP out = Rf_protect(XGBMakeEmptyAltrep());
    char const *serialized_bytes;
    bst_ulong serialized_length;
    CHECK_CALL(XGBoosterSerializeToBuffer(
      R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)),
      &serialized_length, &serialized_bytes));
    BoosterHandle new_handle = nullptr;
    CHECK_CALL(XGBoosterCreate(nullptr, 0, &new_handle));
    int res_code = XGBoosterUnserializeFromBuffer(new_handle,
                                                  serialized_bytes,
                                                  serialized_length);
    if (res_code != 0) {
      XGBoosterFree(new_handle);
    }
    CHECK_CALL(res_code);
    XGBAltrepSetPointer(out, new_handle);
    Rf_unprotect(1);
    return out;
  }
  R_API_END();
  return R_NilValue;  /* <- should not be reached */
}

}  /* namespace */

XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll) {
  XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll);
  R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R);
  R_set_altlist_Elt_method(XGBAltrepPointerClass, XGBAltrepPointerGetElt_R);
  R_set_altrep_Inspect_method(XGBAltrepPointerClass, XGBAltrepInspector_R);
  R_set_altrep_Serialized_state_method(XGBAltrepPointerClass, XGBAltrepSerializer_R);
  R_set_altrep_Unserialize_method(XGBAltrepPointerClass, XGBAltrepDeserializer_R);
  R_set_altrep_Duplicate_method(XGBAltrepPointerClass, XGBAltrepDuplicate_R);
}
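
With this class registered, a booster SEXP is no longer a bare 'externalptr': the handle lives in the altrep instance's data1 slot, and saveRDS()/readRDS() round-trip it through XGBAltrepSerializer_R / XGBAltrepDeserializer_R. A minimal sketch of how calling code can unwrap the handle (GetBoosterHandle is a hypothetical helper for illustration; the commit may factor this differently):

// Illustrative only: reaching the C handle behind the altrepped booster.
BoosterHandle GetBoosterHandle(SEXP R_altrepped_obj) {
  SEXP R_ptr = R_altrep_data1(R_altrepped_obj);  // the wrapped 'externalptr'
  return static_cast<BoosterHandle>(R_ExternalPtrAddr(R_ptr));
}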

XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  SEXP out = Rf_protect(XGBMakeEmptyAltrep());
  R_API_BEGIN();
  R_xlen_t len = Rf_xlength(dmats);
  BoosterHandle handle;
@ -461,33 +839,104 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
    res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
  XGBAltrepSetPointer(out, handle);
  R_API_END();
  UNPROTECT(1);
  return ret;
  Rf_unprotect(1);
  return out;
}

XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat) {
  R_API_BEGIN();
  R_xlen_t len = Rf_xlength(dmats);
  BoosterHandle handle;
  char const **feature_names;
  bst_ulong len_feature_names = 0;
  CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
                                        "feature_name",
                                        &len_feature_names,
                                        &feature_names));
  if (len_feature_names) {
    CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
                                          "feature_name",
                                          feature_names,
                                          len_feature_names));
  }

  char const **feature_types;
  bst_ulong len_feature_types = 0;
  CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
                                        "feature_type",
                                        &len_feature_types,
                                        &feature_types));
  if (len_feature_types) {
    CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
                                          "feature_type",
                                          feature_types,
                                          len_feature_types));
  }
  R_API_END();
  return R_NilValue;
}

XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features) {
  R_API_BEGIN();
  SEXP field_char = Rf_protect(Rf_asChar(field));
  bst_ulong len_features = Rf_xlength(features);

  int res_code;
  {
    std::vector<void*> dvec(len);
    for (R_xlen_t i = 0; i < len; ++i) {
      dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
    std::vector<const char*> str_arr(len_features);
    for (bst_ulong idx = 0; idx < len_features; idx++) {
      str_arr[idx] = CHAR(STRING_ELT(features, idx));
    }
    res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
    res_code = XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(handle),
                                          CHAR(field_char),
                                          str_arr.data(),
                                          len_features);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(R_handle, handle);
  R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE);
  Rf_unprotect(1);
  R_API_END();
  return R_NilValue;
}

XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field) {
  R_API_BEGIN();
  bst_ulong len;
  const char **out_features;
  SEXP field_char = Rf_protect(Rf_asChar(field));
  CHECK_CALL(XGBoosterGetStrFeatureInfo(R_ExternalPtrAddr(handle),
                                        CHAR(field_char), &len, &out_features));
  SEXP out = Rf_protect(Rf_allocVector(STRSXP, len));
  for (bst_ulong idx = 0; idx < len; idx++) {
    SET_STRING_ELT(out, idx, Rf_mkChar(out_features[idx]));
  }
  Rf_unprotect(2);
  return out;
  R_API_END();
  return R_NilValue;  /* <- should not be reached */
}

XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle) {
  SEXP out = Rf_protect(Rf_allocVector(INTSXP, 1));
  R_API_BEGIN();
  CHECK_CALL(XGBoosterBoostedRounds(R_ExternalPtrAddr(handle), INTEGER(out)));
  R_API_END();
  Rf_unprotect(1);
  return out;
}

/* Note: R's integer class is 32-bit-and-signed only, while xgboost
   supports more, so it returns it as a floating point instead */
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle) {
  SEXP out = Rf_protect(Rf_allocVector(REALSXP, 1));
  R_API_BEGIN();
  bst_ulong res;
  CHECK_CALL(XGBoosterGetNumFeature(R_ExternalPtrAddr(handle), &res));
  REAL(out)[0] = static_cast<double>(res);
  R_API_END();
  Rf_unprotect(1);
  return out;
}

XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
  R_API_BEGIN();
  SEXP name_ = PROTECT(Rf_asChar(name));
@ -503,8 +952,8 @@ XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
  R_API_BEGIN();
  CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
                                    asInteger(iter),
                                    R_ExternalPtrAddr(dtrain)));
                                    Rf_asInteger(iter),
                                    R_ExternalPtrAddr(dtrain)));
  R_API_END();
  return R_NilValue;
}
@ -513,20 +962,14 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g
  R_API_BEGIN();
  CHECK_EQ(Rf_xlength(grad), Rf_xlength(hess)) << "gradient and hess must have same length.";
  SEXP gdim = getAttrib(grad, R_DimSymbol);
  auto n_samples = static_cast<std::size_t>(INTEGER(gdim)[0]);
  auto n_targets = static_cast<std::size_t>(INTEGER(gdim)[1]);

  SEXP hdim = getAttrib(hess, R_DimSymbol);
  CHECK_EQ(INTEGER(hdim)[0], n_samples) << "mismatched size between gradient and hessian";
  CHECK_EQ(INTEGER(hdim)[1], n_targets) << "mismatched size between gradient and hessian";
  double const *d_grad = REAL(grad);
  double const *d_hess = REAL(hess);

  int res_code;
  {
    auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
    auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface(
      ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets);
    const std::string s_grad = Rf_isNull(gdim)?
      MakeArrayInterfaceFromRVector(grad) : MakeArrayInterfaceFromRMat(grad);
    const std::string s_hess = Rf_isNull(hdim)?
      MakeArrayInterfaceFromRVector(hess) : MakeArrayInterfaceFromRMat(hess);
    res_code = XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain),
                                     asInteger(iter), s_grad.c_str(), s_hess.c_str());
  }

@ -8,7 +8,9 @@
#define XGBOOST_R_H_  // NOLINT(*)


#include <R.h>
#include <Rinternals.h>
#include <R_ext/Altrep.h>
#include <R_ext/Random.h>
#include <Rmath.h>

@ -53,6 +55,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                                      SEXP missing,
                                      SEXP n_threads);

/**
 * @brief Create matrix content from a data frame.
 * @param data R data.frame object
 * @param missing which value to represent missing value
 * @param n_threads Number of threads used to construct DMatrix from dense matrix.
 * @return created dmatrix
 */
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads);

/*!
 * \brief create a matrix content from CSC format
 * \param indptr pointer to column headers
@ -106,12 +118,20 @@ XGB_DLL SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
XGB_DLL SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);

/*!
 * \brief get info vector from matrix
 * \brief get info vector (float type) from matrix
 * \param handle an instance of data matrix
 * \param field field name
 * \return info vector
 */
XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
XGB_DLL SEXP XGDMatrixGetFloatInfo_R(SEXP handle, SEXP field);

/*!
 * \brief get info vector (uint type) from matrix
 * \param handle an instance of data matrix
 * \param field field name
 * \return info vector
 */
XGB_DLL SEXP XGDMatrixGetUIntInfo_R(SEXP handle, SEXP field);

/*!
 * \brief return number of rows
@ -125,19 +145,87 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
 */
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);

/*!
 * \brief Call R C-level function 'duplicate'
 * \param obj Object to duplicate
 */
XGB_DLL SEXP XGDuplicate_R(SEXP obj);

/*!
 * \brief Equality comparison for two pointers
 * \param obj1 R 'externalptr'
 * \param obj2 R 'externalptr'
 */
XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2);

/*!
 * \brief Register the Altrep class used for the booster
 * \param dll DLL info as provided by R_init
 */
XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll);

/*!
 * \brief return the quantile cuts used for the histogram method
 * \param handle an instance of data matrix
 * \return A list with entries 'indptr' and 'data'
 */
XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle);

/*!
 * \brief get the number of non-missing entries in a dmatrix
 * \param handle an instance of data matrix
 * \return the number of non-missing entries
 */
XGB_DLL SEXP XGDMatrixNumNonMissing_R(SEXP handle);

/*!
 * \brief get the data in a dmatrix in CSR format
 * \param handle an instance of data matrix
 * \return R list with the following entries in this order:
 *   - 'indptr'
 *   - 'indices'
 *   - 'data'
 *   - 'ncols'
 */
XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle);

/*!
 * \brief create xgboost learner
 * \param dmats a list of dmatrix handles that will be cached
 */
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats);

/*!
 * \brief copy information about features from a DMatrix into a Booster
 * \param booster R 'externalptr' pointing to a booster object
 * \param dmat R 'externalptr' pointing to a DMatrix object
 */
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat);

/*!
 * \brief create xgboost learner, saving the pointer into an existing R object
 * \param dmats a list of dmatrix handles that will be cached
 * \param R_handle a clean R external pointer (not holding any object)
 * \param handle R 'externalptr' holding the booster object
 * \param field field name
 * \param features features to set for the field
 */
XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle);
XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features);

/*!
 * \param handle R 'externalptr' holding the booster object
 * \param field field name
 */
XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field);

/*!
 * \brief Get the number of boosted rounds from a model
 * \param handle R 'externalptr' holding the booster object
 */
XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle);

/*!
 * \brief Get the number of features to which the model was fitted
 * \param handle R 'externalptr' holding the booster object
 */
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle);

/*!
 * \brief set parameters

@ -17,7 +17,11 @@ namespace xgboost {
|
||||
ConsoleLogger::~ConsoleLogger() {
|
||||
if (cur_verbosity_ == LogVerbosity::kIgnore ||
|
||||
cur_verbosity_ <= GlobalVerbosity()) {
|
||||
dmlc::CustomLogMessage::Log(log_stream_.str());
|
||||
if (cur_verbosity_ == LogVerbosity::kWarning) {
|
||||
REprintf("%s\n", log_stream_.str().c_str());
|
||||
} else {
|
||||
dmlc::CustomLogMessage::Log(log_stream_.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
TrackerLogger::~TrackerLogger() {
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
## inconsistent is found.
|
||||
pkgs <- c(
|
||||
## CI
|
||||
"caret",
|
||||
"pkgbuild",
|
||||
"roxygen2",
|
||||
"XML",
|
||||
|
||||
@ -16,18 +16,19 @@ n_threads <- 1
|
||||
test_that("train and predict binary classification", {
|
||||
nrounds <- 2
|
||||
expect_output(
|
||||
bst <- xgboost(
|
||||
data = train$data, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = nrounds,
|
||||
objective = "binary:logistic", eval_metric = "error"
|
||||
objective = "binary:logistic", eval_metric = "error",
|
||||
watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
|
||||
),
|
||||
"train-error"
|
||||
)
|
||||
expect_equal(class(bst), "xgb.Booster")
|
||||
expect_equal(bst$niter, nrounds)
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_equal(nrow(bst$evaluation_log), nrounds)
|
||||
expect_lt(bst$evaluation_log[, min(train_error)], 0.03)
|
||||
expect_equal(xgb.get.num.boosted.rounds(bst), nrounds)
|
||||
expect_false(is.null(attributes(bst)$evaluation_log))
|
||||
expect_equal(nrow(attributes(bst)$evaluation_log), nrounds)
|
||||
expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.03)
|
||||
|
||||
pred <- predict(bst, test$data)
|
||||
expect_length(pred, 1611)
|
||||
@ -35,7 +36,7 @@ test_that("train and predict binary classification", {
|
||||
pred1 <- predict(bst, train$data, ntreelimit = 1)
|
||||
expect_length(pred1, 6513)
|
||||
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
|
||||
err_log <- bst$evaluation_log[1, train_error]
|
||||
err_log <- attributes(bst)$evaluation_log[1, train_error]
|
||||
expect_lt(abs(err_pred1 - err_log), 10e-6)
|
||||
|
||||
pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
|
||||
@ -56,7 +57,7 @@ test_that("parameter validation works", {
|
||||
y <- d[, "x1"] + d[, "x2"]^2 +
|
||||
ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
|
||||
rnorm(10)
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
|
||||
dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads)
|
||||
|
||||
correct <- function() {
|
||||
params <- list(
|
||||
@ -82,7 +83,8 @@ test_that("parameter validation works", {
|
||||
bar = "foo"
|
||||
)
|
||||
output <- capture.output(
|
||||
xgb.train(params = params, data = dtrain, nrounds = nrounds)
|
||||
xgb.train(params = params, data = dtrain, nrounds = nrounds),
|
||||
type = "message"
|
||||
)
|
||||
print(output)
|
||||
}
|
||||
@ -104,9 +106,8 @@ test_that("dart prediction works", {
|
||||
rnorm(100)
|
||||
|
||||
set.seed(1994)
|
||||
booster_by_xgboost <- xgboost(
|
||||
data = d,
|
||||
label = y,
|
||||
booster_by_xgboost <- xgb.train(
|
||||
data = xgb.DMatrix(d, label = y),
|
||||
max_depth = 2,
|
||||
booster = "dart",
|
||||
rate_drop = 0.5,
|
||||
@ -124,7 +125,7 @@ test_that("dart prediction works", {
|
||||
expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))
|
||||
|
||||
set.seed(1994)
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
|
||||
dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads)
|
||||
booster_by_train <- xgb.train(
|
||||
params = list(
|
||||
booster = "dart",
|
||||
@ -151,16 +152,17 @@ test_that("train and predict softprob", {
|
||||
lb <- as.numeric(iris$Species) - 1
|
||||
set.seed(11)
|
||||
expect_output(
|
||||
bst <- xgboost(
|
||||
data = as.matrix(iris[, -5]), label = lb,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
|
||||
max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
|
||||
objective = "multi:softprob", num_class = 3, eval_metric = "merror"
|
||||
objective = "multi:softprob", num_class = 3, eval_metric = "merror",
|
||||
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
|
||||
),
|
||||
"train-merror"
|
||||
)
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
|
||||
expect_equal(bst$niter * 3, xgb.ntree(bst))
|
||||
expect_false(is.null(attributes(bst)$evaluation_log))
|
||||
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
|
||||
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
|
||||
pred <- predict(bst, as.matrix(iris[, -5]))
|
||||
expect_length(pred, nrow(iris) * 3)
|
||||
# row sums add up to total probability of 1:
|
||||
@ -170,12 +172,12 @@ test_that("train and predict softprob", {
|
||||
expect_equal(as.numeric(t(mpred)), pred)
|
||||
pred_labels <- max.col(mpred) - 1
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
# manually calculate error at the 1st iteration:
|
||||
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
|
||||
pred_labels <- max.col(mpred) - 1
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
|
||||
expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6)
|
||||
|
||||
mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
|
||||
expect_equal(mpred, mpred1)
|
||||
@ -186,7 +188,7 @@ test_that("train and predict softprob", {
|
||||
x3 = rnorm(100)
|
||||
)
|
||||
y <- sample.int(10, 100, replace = TRUE) - 1
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
|
||||
dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads)
|
||||
booster <- xgb.train(
|
||||
params = list(tree_method = "hist", nthread = n_threads),
|
||||
data = dtrain, nrounds = 4, num_class = 10,
|
||||
@ -201,39 +203,41 @@ test_that("train and predict softmax", {
|
||||
lb <- as.numeric(iris$Species) - 1
|
||||
set.seed(11)
|
||||
expect_output(
|
||||
bst <- xgboost(
|
||||
data = as.matrix(iris[, -5]), label = lb,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
|
||||
max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
|
||||
objective = "multi:softmax", num_class = 3, eval_metric = "merror"
|
||||
objective = "multi:softmax", num_class = 3, eval_metric = "merror",
|
||||
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
|
||||
),
|
||||
"train-merror"
|
||||
)
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
|
||||
expect_equal(bst$niter * 3, xgb.ntree(bst))
|
||||
expect_false(is.null(attributes(bst)$evaluation_log))
|
||||
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
|
||||
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
|
||||
|
||||
pred <- predict(bst, as.matrix(iris[, -5]))
|
||||
expect_length(pred, nrow(iris))
|
||||
err <- sum(pred != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
|
||||
})
|
||||
|
||||
test_that("train and predict RF", {
|
||||
set.seed(11)
|
||||
lb <- train$label
|
||||
# single iteration
|
||||
bst <- xgboost(
|
||||
data = train$data, label = lb, max_depth = 5,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = lb), max_depth = 5,
|
||||
nthread = n_threads,
|
||||
nrounds = 1, objective = "binary:logistic", eval_metric = "error",
|
||||
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1
|
||||
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1,
|
||||
watchlist = list(train = xgb.DMatrix(train$data, label = lb))
|
||||
)
|
||||
expect_equal(bst$niter, 1)
|
||||
expect_equal(xgb.get.num.boosted.rounds(bst), 1)
|
||||
expect_equal(xgb.ntree(bst), 20)
|
||||
|
||||
pred <- predict(bst, train$data)
|
||||
pred_err <- sum((pred > 0.5) != lb) / length(lb)
|
||||
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
|
||||
expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6)
|
||||
# expect_lt(pred_err, 0.03)
|
||||
|
||||
pred <- predict(bst, train$data, ntreelimit = 20)
|
||||
@ -248,50 +252,53 @@ test_that("train and predict RF with softprob", {
|
||||
lb <- as.numeric(iris$Species) - 1
|
||||
nrounds <- 15
|
||||
set.seed(11)
|
||||
bst <- xgboost(
|
||||
data = as.matrix(iris[, -5]), label = lb,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
|
||||
max_depth = 3, eta = 0.9, nthread = n_threads, nrounds = nrounds,
|
||||
objective = "multi:softprob", eval_metric = "merror",
|
||||
num_class = 3, verbose = 0,
|
||||
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5
|
||||
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5,
|
||||
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
|
||||
)
|
||||
expect_equal(bst$niter, 15)
|
||||
expect_equal(xgb.get.num.boosted.rounds(bst), 15)
|
||||
expect_equal(xgb.ntree(bst), 15 * 3 * 4)
|
||||
# predict for all iterations:
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
|
||||
expect_equal(dim(pred), c(nrow(iris), 3))
|
||||
pred_labels <- max.col(pred) - 1
|
||||
err <- sum(pred_labels != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
|
||||
expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
|
||||
# predict for 7 iterations and adjust for 4 parallel trees per iteration
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
|
||||
err <- sum((max.col(pred) - 1) != lb) / length(lb)
|
||||
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
|
||||
expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6)
|
||||
})
|
||||
|
||||
test_that("use of multiple eval metrics works", {
|
||||
expect_output(
|
||||
bst <- xgboost(
|
||||
data = train$data, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
|
||||
eval_metric = "error", eval_metric = "auc", eval_metric = "logloss",
|
||||
watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
|
||||
),
|
||||
"train-error.*train-auc.*train-logloss"
|
||||
)
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_equal(dim(bst$evaluation_log), c(2, 4))
|
||||
expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
|
||||
expect_false(is.null(attributes(bst)$evaluation_log))
|
||||
expect_equal(dim(attributes(bst)$evaluation_log), c(2, 4))
|
||||
expect_equal(colnames(attributes(bst)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
|
||||
expect_output(
|
||||
bst2 <- xgboost(
|
||||
data = train$data, label = train$label, max_depth = 2,
|
||||
bst2 <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
eval_metric = list("error", "auc", "logloss")
|
||||
eval_metric = list("error", "auc", "logloss"),
|
||||
watchlist = list(train = xgb.DMatrix(train$data, label = train$label))
|
||||
),
|
||||
"train-error.*train-auc.*train-logloss"
|
||||
)
|
||||
expect_false(is.null(bst2$evaluation_log))
|
||||
expect_equal(dim(bst2$evaluation_log), c(2, 4))
|
||||
expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
|
||||
expect_false(is.null(attributes(bst2)$evaluation_log))
|
||||
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 4))
|
||||
expect_equal(colnames(attributes(bst2)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
|
||||
})
|
||||
|
||||
|
||||
@ -311,41 +318,25 @@ test_that("training continuation works", {
|
||||
# continue for two more:
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
|
||||
if (!windows_flag && !solaris_flag) {
|
||||
expect_equal(bst$raw, bst2$raw)
|
||||
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
|
||||
}
|
||||
expect_false(is.null(bst2$evaluation_log))
|
||||
expect_equal(dim(bst2$evaluation_log), c(4, 2))
|
||||
expect_equal(bst2$evaluation_log, bst$evaluation_log)
|
||||
expect_false(is.null(attributes(bst2)$evaluation_log))
|
||||
expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2))
|
||||
expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log)
|
||||
# test continuing from raw model data
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
|
||||
if (!windows_flag && !solaris_flag) {
|
||||
expect_equal(bst$raw, bst2$raw)
|
||||
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
|
||||
}
|
||||
expect_equal(dim(bst2$evaluation_log), c(2, 2))
|
||||
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
|
||||
# test continuing from a model in file
|
||||
xgb.save(bst1, "xgboost.json")
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
|
||||
fname <- file.path(tempdir(), "xgboost.json")
|
||||
xgb.save(bst1, fname)
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = fname)
|
||||
if (!windows_flag && !solaris_flag) {
|
||||
expect_equal(bst$raw, bst2$raw)
|
||||
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
|
||||
}
|
||||
expect_equal(dim(bst2$evaluation_log), c(2, 2))
|
||||
file.remove("xgboost.json")
|
||||
})
|
||||
|
||||
test_that("model serialization works", {
|
||||
out_path <- "model_serialization"
|
||||
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
|
||||
watchlist <- list(train = dtrain)
|
||||
param <- list(objective = "binary:logistic", nthread = n_threads)
|
||||
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
|
||||
raw <- xgb.serialize(booster)
|
||||
saveRDS(raw, out_path)
|
||||
raw <- readRDS(out_path)
|
||||
|
||||
loaded <- xgb.unserialize(raw)
|
||||
raw_from_loaded <- xgb.serialize(loaded)
|
||||
expect_equal(raw, raw_from_loaded)
|
||||
file.remove(out_path)
|
||||
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
|
||||
})
|
||||
|
||||
test_that("xgb.cv works", {
|
||||
@ -361,7 +352,7 @@ test_that("xgb.cv works", {
|
||||
expect_is(cv, "xgb.cv.synchronous")
|
||||
expect_false(is.null(cv$evaluation_log))
|
||||
expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
|
||||
expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008)
|
||||
expect_lt(cv$evaluation_log[, min(test_error_std)], 0.0085)
|
||||
expect_equal(cv$niter, 2)
|
||||
expect_false(is.null(cv$folds) && is.list(cv$folds))
|
||||
expect_length(cv$folds, 5)
|
||||
@ -391,8 +382,8 @@ test_that("xgb.cv works with stratified folds", {
|
||||
test_that("train and predict with non-strict classes", {
|
||||
# standard dense matrix input
|
||||
train_dense <- as.matrix(train$data)
|
||||
bst <- xgboost(
|
||||
data = train_dense, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
verbose = 0
|
||||
)
|
||||
@ -402,8 +393,8 @@ test_that("train and predict with non-strict classes", {
|
||||
class(train_dense) <- "shmatrix"
|
||||
expect_true(is.matrix(train_dense))
|
||||
expect_error(
|
||||
bst <- xgboost(
|
||||
data = train_dense, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
verbose = 0
|
||||
),
|
||||
@ -416,8 +407,8 @@ test_that("train and predict with non-strict classes", {
|
||||
class(train_dense) <- c("pphmatrix", "shmatrix")
|
||||
expect_true(is.matrix(train_dense))
|
||||
expect_error(
|
||||
bst <- xgboost(
|
||||
data = train_dense, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train_dense, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
verbose = 0
|
||||
),
|
||||
@ -448,8 +439,8 @@ test_that("max_delta_step works", {
|
||||
# model with restricted max_delta_step
|
||||
bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
|
||||
# the no-restriction model is expected to have consistently lower loss during the initial iterations
|
||||
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
|
||||
expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8)
|
||||
expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss))
|
||||
expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8)
|
||||
})
|
||||
|
||||
test_that("colsample_bytree works", {
|
||||
@ -480,8 +471,8 @@ test_that("colsample_bytree works", {
|
||||
})
|
||||
|
||||
test_that("Configuration works", {
|
||||
bst <- xgboost(
|
||||
data = train$data, label = train$label, max_depth = 2,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
|
||||
eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
|
||||
)
|
||||
@ -521,8 +512,8 @@ test_that("strict_shape works", {
|
||||
y <- as.numeric(iris$Species) - 1
|
||||
X <- as.matrix(iris[, -5])
|
||||
|
||||
bst <- xgboost(
|
||||
data = X, label = y,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(X, label = y),
|
||||
max_depth = 2, nrounds = n_rounds, nthread = n_threads,
|
||||
objective = "multi:softprob", num_class = 3, eval_metric = "merror"
|
||||
)
|
||||
@ -536,8 +527,8 @@ test_that("strict_shape works", {
|
||||
X <- agaricus.train$data
|
||||
y <- agaricus.train$label
|
||||
|
||||
bst <- xgboost(
|
||||
data = X, label = y, max_depth = 2, nthread = n_threads,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(X, label = y), max_depth = 2, nthread = n_threads,
|
||||
nrounds = n_rounds, objective = "binary:logistic",
|
||||
eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
|
||||
)
|
||||
@ -555,8 +546,8 @@ test_that("'predict' accepts CSR data", {
|
||||
x_csc <- as(X[1L, , drop = FALSE], "CsparseMatrix")
|
||||
x_csr <- as(x_csc, "RsparseMatrix")
|
||||
x_spv <- as(x_csc, "sparseVector")
|
||||
bst <- xgboost(
|
||||
data = X, label = y, objective = "binary:logistic",
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(X, label = y), objective = "binary:logistic",
|
||||
nrounds = 5L, verbose = FALSE, nthread = n_threads,
|
||||
)
|
||||
p_csc <- predict(bst, x_csc)
|
||||
@ -565,3 +556,234 @@ test_that("'predict' accepts CSR data", {
|
||||
expect_equal(p_csc, p_csr)
|
||||
expect_equal(p_csc, p_spv)
|
||||
})
|
||||
|
||||
test_that("Quantile regression accepts multiple quantiles", {
|
||||
data(mtcars)
|
||||
y <- mtcars[, 1]
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
dm <- xgb.DMatrix(data = x, label = y)
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
objective = "reg:quantileerror",
|
||||
tree_method = "exact",
|
||||
quantile_alpha = c(0.05, 0.5, 0.95),
|
||||
nthread = n_threads
|
||||
),
|
||||
nrounds = 15
|
||||
)
|
||||
pred <- predict(model, x, reshape = TRUE)
|
||||
|
||||
expect_equal(dim(pred)[1], nrow(x))
|
||||
expect_equal(dim(pred)[2], 3)
|
||||
expect_true(all(pred[, 1] <= pred[, 3]))
|
||||
|
||||
cors <- cor(y, pred)
|
||||
expect_true(cors[2] > cors[1])
|
||||
expect_true(cors[2] > cors[3])
|
||||
expect_true(cors[2] > 0.85)
|
||||
})
|
||||
|
||||
test_that("Can use multi-output labels with built-in objectives", {
|
||||
data("mtcars")
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
y_mirrored <- cbind(y, -y)
|
||||
dm <- xgb.DMatrix(x, label = y_mirrored, nthread = n_threads)
|
||||
model <- xgb.train(
|
||||
params = list(
|
||||
tree_method = "hist",
|
||||
multi_strategy = "multi_output_tree",
|
||||
objective = "reg:squarederror",
|
||||
nthread = n_threads
|
||||
),
|
||||
data = dm,
|
||||
nrounds = 5
|
||||
)
|
||||
pred <- predict(model, x, reshape = TRUE)
|
||||
expect_equal(pred[, 1], -pred[, 2])
|
||||
expect_true(cor(y, pred[, 1]) > 0.9)
|
||||
expect_true(cor(y, pred[, 2]) < -0.9)
|
||||
})
|
||||
|
||||
test_that("Can use multi-output labels with custom objectives", {
|
||||
data("mtcars")
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
y_mirrored <- cbind(y, -y)
|
||||
dm <- xgb.DMatrix(x, label = y_mirrored, nthread = n_threads)
|
||||
model <- xgb.train(
|
||||
params = list(
|
||||
tree_method = "hist",
|
||||
multi_strategy = "multi_output_tree",
|
||||
base_score = 0,
|
||||
objective = function(pred, dtrain) {
|
||||
y <- getinfo(dtrain, "label")
|
||||
grad <- pred - y
|
||||
hess <- rep(1, nrow(grad) * ncol(grad))
|
||||
hess <- matrix(hess, nrow = nrow(grad))
|
||||
return(list(grad = grad, hess = hess))
|
||||
},
|
||||
nthread = n_threads
|
||||
),
|
||||
data = dm,
|
||||
nrounds = 5
|
||||
)
|
||||
pred <- predict(model, x, reshape = TRUE)
|
||||
expect_equal(pred[, 1], -pred[, 2])
|
||||
expect_true(cor(y, pred[, 1]) > 0.9)
|
||||
expect_true(cor(y, pred[, 2]) < -0.9)
|
||||
})
|
||||
|
||||
test_that("Can use ranking objectives with either 'qid' or 'group'", {
|
||||
set.seed(123)
|
||||
x <- matrix(rnorm(100 * 10), nrow = 100)
|
||||
y <- sample(2, size = 100, replace = TRUE) - 1
|
||||
qid <- c(rep(1, 20), rep(2, 20), rep(3, 60))
|
||||
gr <- c(20, 20, 60)
|
||||
|
||||
dmat_qid <- xgb.DMatrix(x, label = y, qid = qid)
|
||||
dmat_gr <- xgb.DMatrix(x, label = y, group = gr)
|
||||
|
||||
params <- list(tree_method = "hist",
|
||||
lambdarank_num_pair_per_sample = 8,
|
||||
objective = "rank:ndcg",
|
||||
lambdarank_pair_method = "topk",
|
||||
nthread = n_threads)
|
||||
set.seed(123)
|
||||
model_qid <- xgb.train(params, dmat_qid, nrounds = 5)
|
||||
set.seed(123)
|
||||
model_gr <- xgb.train(params, dmat_gr, nrounds = 5)
|
||||
|
||||
pred_qid <- predict(model_qid, x)
|
||||
pred_gr <- predict(model_gr, x)
|
||||
expect_equal(pred_qid, pred_gr)
|
||||
})
|
||||
|
||||
test_that("Coefficients from gblinear have the expected shape and names", {
|
||||
# Single-column coefficients
|
||||
data(mtcars)
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
mm <- model.matrix(~., data = mtcars[, -1])
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
booster = "gblinear",
|
||||
nthread = 1
|
||||
),
|
||||
nrounds = 3
|
||||
)
|
||||
coefs <- coef(model)
|
||||
expect_equal(length(coefs), ncol(x) + 1)
|
||||
expect_equal(names(coefs), c("(Intercept)", colnames(x)))
|
||||
pred_auto <- predict(model, x)
|
||||
pred_manual <- as.numeric(mm %*% coefs)
|
||||
expect_equal(pred_manual, pred_auto, tolerance = 1e-5)
|
||||
|
||||
# Multi-column coefficients
|
||||
data(iris)
|
||||
y <- as.numeric(iris$Species) - 1
|
||||
x <- as.matrix(iris[, -5])
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
mm <- model.matrix(~., data = iris[, -5])
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
booster = "gblinear",
|
||||
objective = "multi:softprob",
|
||||
num_class = 3,
|
||||
nthread = 1
|
||||
),
|
||||
nrounds = 3
|
||||
)
|
||||
coefs <- coef(model)
|
||||
expect_equal(nrow(coefs), ncol(x) + 1)
|
||||
expect_equal(ncol(coefs), 3)
|
||||
expect_equal(row.names(coefs), c("(Intercept)", colnames(x)))
|
||||
pred_auto <- predict(model, x, outputmargin = TRUE, reshape = TRUE)
|
||||
pred_manual <- unname(mm %*% coefs)
|
||||
expect_equal(pred_manual, pred_auto, tolerance = 1e-7)
|
||||
})
|
||||
|
||||
test_that("Deep copies work as expected", {
|
||||
data(mtcars)
|
||||
y <- mtcars$mpg
|
||||
x <- mtcars[, -1]
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(nthread = 1),
|
||||
nrounds = 3
|
||||
)
|
||||
|
||||
xgb.attr(model, "my_attr") <- 100
|
||||
model_shallow_copy <- model
|
||||
xgb.attr(model_shallow_copy, "my_attr") <- 333
|
||||
attr_orig <- xgb.attr(model, "my_attr")
|
||||
attr_shallow <- xgb.attr(model_shallow_copy, "my_attr")
|
||||
expect_equal(attr_orig, attr_shallow)
|
||||
|
||||
model_deep_copy <- xgb.copy.Booster(model)
|
||||
xgb.attr(model_deep_copy, "my_attr") <- 444
|
||||
attr_orig <- xgb.attr(model, "my_attr")
|
||||
attr_deep <- xgb.attr(model_deep_copy, "my_attr")
|
||||
expect_false(attr_orig == attr_deep)
|
||||
})
|
||||
|
||||
test_that("Pointer comparison works as expected", {
|
||||
library(xgboost)
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
model <- xgb.train(
|
||||
params = list(nthread = 1),
|
||||
data = xgb.DMatrix(x, label = y, nthread = 1),
|
||||
nrounds = 3
|
||||
)
|
||||
|
||||
model_shallow_copy <- model
|
||||
expect_true(xgb.is.same.Booster(model, model_shallow_copy))
|
||||
|
||||
model_deep_copy <- xgb.copy.Booster(model)
|
||||
expect_false(xgb.is.same.Booster(model, model_deep_copy))
|
||||
|
||||
xgb.attr(model_shallow_copy, "my_attr") <- 111
|
||||
expect_equal(xgb.attr(model, "my_attr"), "111")
|
||||
expect_null(xgb.attr(model_deep_copy, "my_attr"))
|
||||
})
|
||||
|
||||
test_that("DMatrix field are set to booster when training", {
|
||||
set.seed(123)
|
||||
y <- rnorm(100)
|
||||
x <- matrix(rnorm(100 * 3), nrow = 100)
|
||||
x[, 2] <- abs(as.integer(x[, 2]))
|
||||
|
||||
dm_unnamed <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
dm_feature_names <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
|
||||
dm_feature_types <- xgb.DMatrix(x, label = y)
|
||||
setinfo(dm_feature_types, "feature_type", c("q", "c", "q"))
|
||||
dm_both <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
|
||||
setinfo(dm_both, "feature_type", c("q", "c", "q"))
|
||||
|
||||
params <- list(nthread = 1)
|
||||
model_unnamed <- xgb.train(data = dm_unnamed, params = params, nrounds = 3)
|
||||
model_feature_names <- xgb.train(data = dm_feature_names, params = params, nrounds = 3)
|
||||
model_feature_types <- xgb.train(data = dm_feature_types, params = params, nrounds = 3)
|
||||
model_both <- xgb.train(data = dm_both, params = params, nrounds = 3)
|
||||
|
||||
expect_null(getinfo(model_unnamed, "feature_name"))
|
||||
expect_equal(getinfo(model_feature_names, "feature_name"), c("a", "b", "c"))
|
||||
expect_null(getinfo(model_feature_types, "feature_name"))
|
||||
expect_equal(getinfo(model_both, "feature_name"), c("a", "b", "c"))
|
||||
|
||||
expect_null(variable.names(model_unnamed))
|
||||
expect_equal(variable.names(model_feature_names), c("a", "b", "c"))
|
||||
expect_null(variable.names(model_feature_types))
|
||||
expect_equal(variable.names(model_both), c("a", "b", "c"))
|
||||
|
||||
expect_null(getinfo(model_unnamed, "feature_type"))
|
||||
expect_null(getinfo(model_feature_names, "feature_type"))
|
||||
expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q"))
|
||||
expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q"))
|
})

@ -57,7 +57,7 @@ test_that("cb.print.evaluation works as expected", {
  expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")

  bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2)
  expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000")
  expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000±0.100000\ttest-auc:0.800000±0.200000")
})

test_that("cb.evaluation.log works as expected", {
@ -111,9 +111,9 @@ test_that("can store evaluation_log without printing", {
  expect_silent(
    bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
  )
  expect_false(is.null(bst$evaluation_log))
  expect_false(is.null(bst$evaluation_log$train_error))
  expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
  expect_false(is.null(attributes(bst)$evaluation_log))
  expect_false(is.null(attributes(bst)$evaluation_log$train_error))
  expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2)
})

test_that("cb.reset.parameters works as expected", {
@ -121,34 +121,34 @@ test_that("cb.reset.parameters works as expected", {
  # fixed eta
  set.seed(111)
  bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
  expect_false(is.null(bst0$evaluation_log))
  expect_false(is.null(bst0$evaluation_log$train_error))
  expect_false(is.null(attributes(bst0)$evaluation_log))
  expect_false(is.null(attributes(bst0)$evaluation_log$train_error))

  # same eta but re-set as a vector parameter in the callback
  set.seed(111)
  my_par <- list(eta = c(0.9, 0.9))
  bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst1$evaluation_log$train_error))
  expect_equal(bst0$evaluation_log$train_error,
               bst1$evaluation_log$train_error)
  expect_false(is.null(attributes(bst1)$evaluation_log$train_error))
  expect_equal(attributes(bst0)$evaluation_log$train_error,
               attributes(bst1)$evaluation_log$train_error)

  # same eta but re-set via a function in the callback
  set.seed(111)
  my_par <- list(eta = function(itr, itr_end) 0.9)
  bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst2$evaluation_log$train_error))
  expect_equal(bst0$evaluation_log$train_error,
               bst2$evaluation_log$train_error)
  expect_false(is.null(attributes(bst2)$evaluation_log$train_error))
  expect_equal(attributes(bst0)$evaluation_log$train_error,
               attributes(bst2)$evaluation_log$train_error)

  # different eta re-set as a vector parameter in the callback
  set.seed(111)
  my_par <- list(eta = c(0.6, 0.5))
  bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bst3$evaluation_log$train_error))
  expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
  expect_false(is.null(attributes(bst3)$evaluation_log$train_error))
  expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error))

  # resetting multiple parameters at the same time runs with no error
  my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
@ -166,38 +166,39 @@ test_that("cb.reset.parameters works as expected", {
  my_par <- list(eta = c(0., 0.))
  bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
                    callbacks = list(cb.reset.parameters(my_par)))
  expect_false(is.null(bstX$evaluation_log$train_error))
  er <- unique(bstX$evaluation_log$train_error)
  expect_false(is.null(attributes(bstX)$evaluation_log$train_error))
  er <- unique(attributes(bstX)$evaluation_log$train_error)
  expect_length(er, 1)
  expect_gt(er, 0.4)
})

test_that("cb.save.model works as expected", {
  files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json')
  files <- unname(sapply(files, function(f) file.path(tempdir(), f)))
  for (f in files) if (file.exists(f)) file.remove(f)

  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
                   save_period = 1, save_name = "xgboost_%02d.json")
  expect_true(file.exists('xgboost_01.json'))
  expect_true(file.exists('xgboost_02.json'))
  b1 <- xgb.load('xgboost_01.json')
                   save_period = 1, save_name = file.path(tempdir(), "xgboost_%02d.json"))
  expect_true(file.exists(files[1]))
  expect_true(file.exists(files[2]))
  b1 <- xgb.load(files[1])
  xgb.parameters(b1) <- list(nthread = 2)
  expect_equal(xgb.ntree(b1), 1)
  b2 <- xgb.load('xgboost_02.json')
  expect_equal(xgb.get.num.boosted.rounds(b1), 1)
  b2 <- xgb.load(files[2])
  xgb.parameters(b2) <- list(nthread = 2)
  expect_equal(xgb.ntree(b2), 2)
  expect_equal(xgb.get.num.boosted.rounds(b2), 2)

  xgb.config(b2) <- xgb.config(bst)
  expect_equal(xgb.config(bst), xgb.config(b2))
  expect_equal(bst$raw, b2$raw)
  expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))

  # save_period = 0 saves the last iteration's model
  bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
                   save_period = 0, save_name = 'xgboost.json')
  expect_true(file.exists('xgboost.json'))
  b2 <- xgb.load('xgboost.json')
                   save_period = 0, save_name = file.path(tempdir(), 'xgboost.json'))
  expect_true(file.exists(files[3]))
  b2 <- xgb.load(files[3])
  xgb.config(b2) <- xgb.config(bst)
  expect_equal(bst$raw, b2$raw)
  expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))

  for (f in files) if (file.exists(f)) file.remove(f)
})
@ -208,14 +209,14 @@ test_that("early stopping xgb.train works", {
    bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
                     early_stopping_rounds = 3, maximize = FALSE)
  , "Stopping. Best iteration")
  expect_false(is.null(bst$best_iteration))
  expect_lt(bst$best_iteration, 19)
  expect_equal(bst$best_iteration, bst$best_ntreelimit)
  expect_false(is.null(xgb.attr(bst, "best_iteration")))
  expect_lt(xgb.attr(bst, "best_iteration"), 19)
  expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))

  pred <- predict(bst, dtest)
  expect_equal(length(pred), 1611)
  err_pred <- err(ltest, pred)
  err_log <- bst$evaluation_log[bst$best_iteration, test_error]
  err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error]
  expect_equal(err_log, err_pred, tolerance = 5e-6)

  set.seed(11)
@ -223,16 +224,15 @@ test_that("early stopping xgb.train works", {
    bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
                      early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
  )
  expect_equal(bst$evaluation_log, bst0$evaluation_log)
  expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log)

  xgb.save(bst, "model.bin")
  loaded <- xgb.load("model.bin")
  fname <- file.path(tempdir(), "model.bin")
  xgb.save(bst, fname)
  loaded <- xgb.load(fname)

  expect_false(is.null(loaded$best_iteration))
  expect_equal(loaded$best_iteration, bst$best_ntreelimit)
  expect_equal(loaded$best_ntreelimit, bst$best_ntreelimit)

  file.remove("model.bin")
  expect_false(is.null(xgb.attr(loaded, "best_iteration")))
  expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
  expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit"))
})

test_that("early stopping using a specific metric works", {
@ -243,14 +243,14 @@ test_that("early stopping using a specific metric works", {
                     callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
                                                    metric_name = 'test_logloss')))
  , "Stopping. Best iteration")
  expect_false(is.null(bst$best_iteration))
  expect_lt(bst$best_iteration, 19)
  expect_equal(bst$best_iteration, bst$best_ntreelimit)
  expect_false(is.null(xgb.attr(bst, "best_iteration")))
  expect_lt(xgb.attr(bst, "best_iteration"), 19)
  expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))

  pred <- predict(bst, dtest, ntreelimit = bst$best_ntreelimit)
  pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit"))
  expect_equal(length(pred), 1611)
  logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest)
  logloss_log <- bst$evaluation_log[bst$best_iteration, test_logloss]
  logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss]
  expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
})
@ -265,14 +265,14 @@ test_that("early stopping works with titanic", {
  dtx <- model.matrix(~ 0 + ., data = titanic[, c("Pclass", "Sex")])
  dty <- titanic$Survived

  xgboost::xgboost(
    data = dtx,
    label = dty,
  xgboost::xgb.train(
    data = xgb.DMatrix(dtx, label = dty),
    objective = "binary:logistic",
    eval_metric = "auc",
    nrounds = 100,
    early_stopping_rounds = 3,
    nthread = n_threads
    nthread = n_threads,
    watchlist = list(train = xgb.DMatrix(dtx, label = dty))
  )

  expect_true(TRUE) # should not crash

@ -35,9 +35,9 @@ num_round <- 2
test_that("custom objective works", {
  bst <- xgb.train(param, dtrain, num_round, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  expect_false(is.null(bst$evaluation_log))
  expect_false(is.null(bst$evaluation_log$eval_error))
  expect_lt(bst$evaluation_log[num_round, eval_error], 0.03)
  expect_false(is.null(attributes(bst)$evaluation_log))
  expect_false(is.null(attributes(bst)$evaluation_log$eval_error))
  expect_lt(attributes(bst)$evaluation_log[num_round, eval_error], 0.03)
})

test_that("custom objective in CV works", {
@ -50,7 +50,7 @@ test_that("custom objective in CV works", {
test_that("custom objective with early stop works", {
  bst <- xgb.train(param, dtrain, 10, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  train_log <- bst$evaluation_log$train_error
  train_log <- attributes(bst)$evaluation_log$train_error
  expect_true(all(diff(train_log) <= 0))
})
@ -67,20 +67,22 @@ test_that("xgb.DMatrix: NA", {
  x[1, "x1"] <- NA

  m <- xgb.DMatrix(x, nthread = n_threads)
  xgb.DMatrix.save(m, "int.dmatrix")
  fname_int <- file.path(tempdir(), "int.dmatrix")
  xgb.DMatrix.save(m, fname_int)

  x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
  colnames(x) <- c("x1", "x2")
  m <- xgb.DMatrix(x, nthread = n_threads)

  xgb.DMatrix.save(m, "float.dmatrix")
  fname_float <- file.path(tempdir(), "float.dmatrix")
  xgb.DMatrix.save(m, fname_float)

  iconn <- file("int.dmatrix", "rb")
  fconn <- file("float.dmatrix", "rb")
  iconn <- file(fname_int, "rb")
  fconn <- file(fname_float, "rb")

  expect_equal(file.size("int.dmatrix"), file.size("float.dmatrix"))
  expect_equal(file.size(fname_int), file.size(fname_float))

  bytes <- file.size("int.dmatrix")
  bytes <- file.size(fname_int)
  idmatrix <- readBin(iconn, "raw", n = bytes)
  fdmatrix <- readBin(fconn, "raw", n = bytes)

@ -90,8 +92,8 @@ test_that("xgb.DMatrix: NA", {
  close(iconn)
  close(fconn)

  file.remove("int.dmatrix")
  file.remove("float.dmatrix")
  file.remove(fname_int)
  file.remove(fname_float)
})

test_that("xgb.DMatrix: saving, loading", {
@ -274,17 +276,19 @@ test_that("xgb.DMatrix: Inf as missing", {
  x_nan[2, 1] <- NA_real_

  m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf)
  xgb.DMatrix.save(m_inf, "inf.dmatrix")
  fname_inf <- file.path(tempdir(), "inf.dmatrix")
  xgb.DMatrix.save(m_inf, fname_inf)

  m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_)
  xgb.DMatrix.save(m_nan, "nan.dmatrix")
  fname_nan <- file.path(tempdir(), "nan.dmatrix")
  xgb.DMatrix.save(m_nan, fname_nan)

  infconn <- file("inf.dmatrix", "rb")
  nanconn <- file("nan.dmatrix", "rb")
  infconn <- file(fname_inf, "rb")
  nanconn <- file(fname_nan, "rb")

  expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix"))
  expect_equal(file.size(fname_inf), file.size(fname_nan))

  bytes <- file.size("inf.dmatrix")
  bytes <- file.size(fname_inf)
  infdmatrix <- readBin(infconn, "raw", n = bytes)
  nandmatrix <- readBin(nanconn, "raw", n = bytes)
@ -294,6 +298,143 @@ test_that("xgb.DMatrix: Inf as missing", {
  close(infconn)
  close(nanconn)

  file.remove("inf.dmatrix")
  file.remove("nan.dmatrix")
  file.remove(fname_inf)
  file.remove(fname_nan)
})

test_that("xgb.DMatrix: error on three-dimensional array", {
  set.seed(123)
  x <- matrix(rnorm(500), nrow = 50)
  y <- rnorm(400)
  dim(y) <- c(50, 4, 2)
  expect_error(xgb.DMatrix(data = x, label = y))
})

test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors", {
  set.seed(123)
  x <- matrix(rnorm(1000), nrow = 100)
  group <- c(20, 20, 60)
  qid <- c(rep(1, 20), rep(2, 20), rep(3, 60))

  gr_mat <- xgb.DMatrix(x, group = group)
  qid_mat <- xgb.DMatrix(x, qid = qid)

  info_gr <- getinfo(gr_mat, "group")
  info_qid <- getinfo(qid_mat, "group")
  expect_equal(info_gr, info_qid)

  expected_gr <- c(0, 20, 40, 100)
  expect_equal(info_gr, expected_gr)
})

test_that("xgb.DMatrix: data.frame", {
  df <- data.frame(
    a = (1:4) / 10,
    num = c(1, NA, 3, 4),
    as.int = as.integer(c(1, 2, 3, 4)),
    lo = c(TRUE, FALSE, NA, TRUE),
    str.fac = c("a", "b", "d", "c"),
    as.fac = as.factor(c(3, 5, 8, 11)),
    stringsAsFactors = TRUE
  )

  m <- xgb.DMatrix(df, enable_categorical = TRUE)
  expect_equal(colnames(m), colnames(df))
  expect_equal(
    getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
  )
  expect_error(xgb.DMatrix(df))

  df <- data.frame(
    missing = c("a", "b", "d", NA),
    valid = c("a", "b", "d", "c"),
    stringsAsFactors = TRUE
  )
  m <- xgb.DMatrix(df, enable_categorical = TRUE)
  expect_equal(getinfo(m, "feature_type"), c("c", "c"))
})

test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", {
  set.seed(123)
  x <- matrix(rnorm(100 * 10), nrow = 100)
  y <- matrix(rnorm(100 * 2), nrow = 100)
  b <- matrix(rnorm(100 * 2), nrow = 100)
  model <- xgb.train(
    data = xgb.DMatrix(data = x, label = y, nthread = n_threads),
    params = list(
      objective = "reg:squarederror",
      tree_method = "hist",
      multi_strategy = "multi_output_tree",
      base_score = 0,
      nthread = n_threads
    ),
    nround = 1
  )
  pred_only_x <- predict(model, x, nthread = n_threads, reshape = TRUE)
  pred_w_base <- predict(
    model,
    xgb.DMatrix(data = x, base_margin = b, nthread = n_threads),
    nthread = n_threads,
    reshape = TRUE
  )
  expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
})

test_that("xgb.DMatrix: number of non-missing matches data", {
  x <- matrix(1:10, nrow = 5)
  dm1 <- xgb.DMatrix(x)
  expect_equal(xgb.get.DMatrix.num.non.missing(dm1), 10)

  x[2, 2] <- NA
  x[4, 1] <- NA
  dm2 <- xgb.DMatrix(x)
  expect_equal(xgb.get.DMatrix.num.non.missing(dm2), 8)
})

test_that("xgb.DMatrix: retrieving data as CSR", {
  data(mtcars)
  dm <- xgb.DMatrix(as.matrix(mtcars))
  csr <- xgb.get.DMatrix.data(dm)
  expect_equal(dim(csr), dim(mtcars))
  expect_equal(colnames(csr), colnames(mtcars))
  expect_equal(unname(as.matrix(csr)), unname(as.matrix(mtcars)), tolerance = 1e-6)
})

test_that("xgb.DMatrix: quantile cuts look correct", {
  data(mtcars)
  y <- mtcars$mpg
  x <- as.matrix(mtcars[, -1])
  dm <- xgb.DMatrix(x, label = y)
  model <- xgb.train(
    data = dm,
    params = list(
      tree_method = "hist",
      max_bin = 8,
      nthread = 1
    ),
    nrounds = 3
  )
  qcut_list <- xgb.get.DMatrix.qcut(dm, "list")
  qcut_arrays <- xgb.get.DMatrix.qcut(dm, "arrays")

  expect_equal(length(qcut_arrays), 2)
  expect_equal(names(qcut_arrays), c("indptr", "data"))
  expect_equal(length(qcut_arrays$indptr), ncol(x) + 1)
  expect_true(min(diff(qcut_arrays$indptr)) > 0)

  col_min <- apply(x, 2, min)
  col_max <- apply(x, 2, max)

  expect_equal(length(qcut_list), ncol(x))
  expect_equal(names(qcut_list), colnames(x))
  lapply(
    seq(1, ncol(x)),
    function(col) {
      cuts <- qcut_list[[col]]
      expect_true(min(diff(cuts)) > 0)
      expect_true(col_min[col] > cuts[1])
      expect_true(col_max[col] < cuts[length(cuts)])
      expect_true(length(cuts) <= 9)
    }
  )
})
@ -6,8 +6,8 @@ test_that("train and prediction when gctorture is on", {
  train <- agaricus.train
  test <- agaricus.test
  gctorture(TRUE)
  bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
  bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max.depth = 2,
                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
  pred <- predict(bst, test$data)
  gctorture(FALSE)
  expect_length(pred, length(test$label))

@ -24,28 +24,28 @@ test_that("gblinear works", {
  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
  ypred <- predict(bst, dtest)
  expect_equal(length(getinfo(dtest, 'label')), 1611)
  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic',
                   callbacks = list(cb.gblinear.history()))
  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
  h <- xgb.gblinear.history(bst)
  expect_equal(dim(h), c(n, ncol(dtrain) + 1))
  expect_is(h, "matrix")

  param$updater <- 'coord_descent'
  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

  bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy')
  expect_lt(bst$evaluation_log$eval_error[2], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL)

  bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty',
                   top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE)))
  expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
  expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
  h <- xgb.gblinear.history(bst)
  expect_equal(dim(h), c(n, ncol(dtrain) + 1))
  expect_s4_class(h, "dgCMatrix")
@ -72,10 +72,10 @@ test_that("gblinear early stopping works", {
  booster <- xgb.train(
    param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
  )
  expect_equal(booster$best_iteration, 5)
  expect_equal(xgb.attr(booster, "best_iteration"), 5)
  predt_es <- predict(booster, dtrain)

  n <- booster$best_iteration + es_round
  n <- xgb.attr(booster, "best_iteration") + es_round
  booster <- xgb.train(
    param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
  )
@ -25,15 +25,15 @@ if (isTRUE(VCD_AVAILABLE)) {
  label <- df[, ifelse(Improved == "Marked", 1, 0)]

  # binary
  bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
                      eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
                      objective = "binary:logistic", booster = "gbtree",
                      base_score = 0.5)
  bst.Tree <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), max_depth = 9,
                        eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
                        objective = "binary:logistic", booster = "gbtree",
                        base_score = 0.5)

  bst.GLM <- xgboost(data = sparse_matrix, label = label,
                     eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
                     objective = "binary:logistic", booster = "gblinear",
                     base_score = 0.5)
  bst.GLM <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label),
                       eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
                       objective = "binary:logistic", booster = "gblinear",
                       base_score = 0.5)

  feature.names <- colnames(sparse_matrix)
}
@ -41,14 +41,17 @@ if (isTRUE(VCD_AVAILABLE)) {
# multiclass
mlabel <- as.numeric(iris$Species) - 1
nclass <- 3
mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
                     max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
                     objective = "multi:softprob", num_class = nclass, base_score = 0)
mbst.Tree <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), verbose = 0,
                       max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
                       objective = "multi:softprob", num_class = nclass, base_score = 0)

mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
                    booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
                    objective = "multi:softprob", num_class = nclass, base_score = 0)
mbst.GLM <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel), verbose = 0,
                      booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
                      objective = "multi:softprob", num_class = nclass, base_score = 0)

# without feature names
bst.Tree.unnamed <- xgb.copy.Booster(bst.Tree)
setinfo(bst.Tree.unnamed, "feature_name", NULL)

test_that("xgb.dump works", {
  .skip_if_vcd_not_available()
@ -71,8 +74,9 @@ test_that("xgb.dump works for gblinear", {
  expect_length(xgb.dump(bst.GLM), 14)
  # also make sure that it works properly for a sparse model where some coefficients
  # are 0 from setting large L1 regularization:
  bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
                        alpha = 2, objective = "binary:logistic", booster = "gblinear")
  bst.GLM.sp <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), eta = 1,
                          nthread = 2, nrounds = 1,
                          alpha = 2, objective = "binary:logistic", booster = "gblinear")
  d.sp <- xgb.dump(bst.GLM.sp)
  expect_length(d.sp, 14)
  expect_gt(sum(d.sp == "0"), 0)
@ -168,7 +172,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
  nrounds <- 30

  for (booster in list("gbtree", "dart")) {
    fit <- xgboost(
    fit <- xgb.train(
      params = c(
        list(
          nthread = 2,
@ -177,8 +181,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
          eval_metric = "rmse"),
        if (booster == "dart")
          list(rate_drop = .01, one_drop = TRUE)),
      data = d,
      label = y,
      data = xgb.DMatrix(d, label = y),
      nrounds = nrounds)

    pr <- function(...) {
@ -204,7 +207,7 @@ test_that("xgb-attribute functionality", {
  list.ch <- list.val[order(names(list.val))]
  list.ch <- lapply(list.ch, as.character)
  # note: iter is 0-index in xgb attributes
  list.default <- list(niter = as.character(nrounds - 1))
  list.default <- list()
  list.ch <- c(list.ch, list.default)
  # proper input:
  expect_error(xgb.attr(bst.Tree, NULL))
@ -212,24 +215,25 @@ test_that("xgb-attribute functionality", {
  # set & get:
  expect_null(xgb.attr(bst.Tree, "asdf"))
  expect_equal(xgb.attributes(bst.Tree), list.default)
  xgb.attr(bst.Tree, "my_attr") <- val
  expect_equal(xgb.attr(bst.Tree, "my_attr"), val)
  xgb.attributes(bst.Tree) <- list.val
  expect_equal(xgb.attributes(bst.Tree), list.ch)
  bst.Tree.copy <- xgb.copy.Booster(bst.Tree)
  xgb.attr(bst.Tree.copy, "my_attr") <- val
  expect_equal(xgb.attr(bst.Tree.copy, "my_attr"), val)
  xgb.attributes(bst.Tree.copy) <- list.val
  expect_equal(xgb.attributes(bst.Tree.copy), list.ch)
  # serializing:
  xgb.save(bst.Tree, 'xgb.model')
  bst <- xgb.load('xgb.model')
  if (file.exists('xgb.model')) file.remove('xgb.model')
  fname <- file.path(tempdir(), "xgb.ubj")
  xgb.save(bst.Tree.copy, fname)
  bst <- xgb.load(fname)
  expect_equal(xgb.attr(bst, "my_attr"), val)
  expect_equal(xgb.attributes(bst), list.ch)
  # deletion:
  xgb.attr(bst, "my_attr") <- NULL
  expect_null(xgb.attr(bst, "my_attr"))
  expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
  expect_equal(xgb.attributes(bst), list.ch[c("a", "b")])
  xgb.attributes(bst) <- list(a = NULL, b = NULL)
  expect_equal(xgb.attributes(bst), list.default)
  xgb.attributes(bst) <- list(niter = NULL)
  expect_null(xgb.attributes(bst))
  expect_equal(xgb.attributes(bst), list())
})

if (grepl('Windows', Sys.info()[['sysname']], fixed = TRUE) ||
@ -256,27 +260,23 @@ if (grepl('Windows', Sys.info()[['sysname']], fixed = TRUE) ||

test_that("xgb.Booster serializing as R object works", {
  .skip_if_vcd_not_available()
  saveRDS(bst.Tree, 'xgb.model.rds')
  bst <- readRDS('xgb.model.rds')
  fname_rds <- file.path(tempdir(), "xgb.model.rds")
  saveRDS(bst.Tree, fname_rds)
  bst <- readRDS(fname_rds)
  dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
  expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
  xgb.save(bst, 'xgb.model')
  if (file.exists('xgb.model')) file.remove('xgb.model')
  bst <- readRDS('xgb.model.rds')
  if (file.exists('xgb.model.rds')) file.remove('xgb.model.rds')
  nil_ptr <- new("externalptr")
  class(nil_ptr) <- "xgb.Booster.handle"
  expect_true(identical(bst$handle, nil_ptr))
  bst <- xgb.Booster.complete(bst)
  expect_true(!identical(bst$handle, nil_ptr))

  fname_bin <- file.path(tempdir(), "xgb.model")
  xgb.save(bst, fname_bin)
  bst <- readRDS(fname_rds)
  expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
})

test_that("xgb.model.dt.tree works with and without feature names", {
  .skip_if_vcd_not_available()
  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
  dt.tree <- xgb.model.dt.tree(model = bst.Tree)
  expect_equal(names.dt.trees, names(dt.tree))
  if (!flag_32bit)
    expect_equal(dim(dt.tree), c(188, 10))
@ -286,9 +286,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
  expect_equal(dt.tree, dt.tree.0)

  # when model contains no feature names:
  bst.Tree.x <- bst.Tree
  bst.Tree.x$feature_names <- NULL
  dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
  dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed)
  expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
  expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])

@ -316,9 +314,7 @@ test_that("xgb.importance works with and without feature names", {
  expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)

  # when model contains no feature names:
  bst.Tree.x <- bst.Tree
  bst.Tree.x$feature_names <- NULL
  importance.Tree.x <- xgb.importance(model = bst.Tree)
  importance.Tree.x <- xgb.importance(model = bst.Tree.unnamed)
  expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
               tolerance = float_tolerance)

@ -334,14 +330,14 @@ test_that("xgb.importance works with and without feature names", {
  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)

  importance_from_dump <- function() {
    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
    model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees)
    imp <- xgb.model.dt.tree(
      feature_names = feature.names,
      text = model_text_dump,
      trees = trees
    )[
      Feature != "Leaf", .(
        Gain = sum(Quality),
        Gain = sum(Gain),
        Cover = sum(Cover),
        Frequency = .N
      ),
@ -360,9 +356,8 @@ test_that("xgb.importance works with and without feature names", {
  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)

  ## decision stump
  m <- xgboost::xgboost(
    data = as.matrix(data.frame(x = c(0, 1))),
    label = c(1, 2),
  m <- xgboost::xgb.train(
    data = xgb.DMatrix(as.matrix(data.frame(x = c(0, 1))), label = c(1, 2)),
    nrounds = 1,
    base_score = 0.5,
    nthread = 2
@ -393,9 +388,9 @@ test_that("xgb.importance works with GLM model", {

test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
  .skip_if_vcd_not_available()
  bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
                  eta = 1, nthread = 2, nrounds = 1, verbose = 0,
                  objective = "binary:logistic")
  bst1 <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = label), max_depth = 1,
                    eta = 1, nthread = 2, nrounds = 1, verbose = 0,
                    objective = "binary:logistic")
  expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
  expect_equal(nrow(dt), 3)
  expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
@ -415,13 +410,13 @@ test_that("xgb.plot.importance de-duplicates features", {

test_that("xgb.plot.tree works with and without feature names", {
  .skip_if_vcd_not_available()
  expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree))
  expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree.unnamed))
  expect_silent(xgb.plot.tree(model = bst.Tree))
})

test_that("xgb.plot.multi.trees works with and without feature names", {
  .skip_if_vcd_not_available()
  xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3)
  xgb.plot.multi.trees(model = bst.Tree.unnamed, feature_names = feature.names, features_keep = 3)
  xgb.plot.multi.trees(model = bst.Tree, features_keep = 3)
})
@ -13,9 +13,9 @@ train <- matrix(c(x1, x2, x3), ncol = 3)

test_that("interaction constraints for regression", {
  # Fit a model that only allows interaction between x1 and x2
  bst <- xgboost(data = train, label = y, max_depth = 3,
                 eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
                 interaction_constraints = list(c(0, 1)))
  bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 3,
                   eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
                   interaction_constraints = list(c(0, 1)))

  # Set all observations to have the same x3 values then increment
  # by the same amount
@ -47,7 +47,7 @@ test_that("interaction constraints scientific representation", {
  d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols)
  y <- rnorm(rows)

  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
  dtrain <- xgb.DMatrix(data = d, label = y, nthread = n_threads)
  inc <- list(c(seq.int(from = 0, to = cols, by = 1)))

  with_inc <- xgb.train(

@ -98,15 +98,14 @@ test_that("SHAP contribution values are not NAN", {

  ivs <- c("x1", "x2")

  fit <- xgboost(
  fit <- xgb.train(
    verbose = 0,
    params = list(
      objective = "reg:squarederror",
      eval_metric = "rmse",
      nthread = n_threads
    ),
    data = as.matrix(subset(d, fold == 2)[, ivs]),
    label = subset(d, fold == 2)$y,
    data = xgb.DMatrix(as.matrix(subset(d, fold == 2)[, ivs]), label = subset(d, fold == 2)$y),
    nrounds = 3
  )

@ -169,9 +168,8 @@ test_that("multiclass feature interactions work", {
test_that("SHAP single sample works", {
  train <- agaricus.train
  test <- agaricus.test
  booster <- xgboost(
    data = train$data,
    label = train$label,
  booster <- xgb.train(
    data = xgb.DMatrix(train$data, label = train$label),
    max_depth = 2,
    nrounds = 4,
    objective = "binary:logistic",
@ -7,8 +7,8 @@ test <- agaricus.test

test_that("load/save raw works", {
  nrounds <- 8
  booster <- xgboost(
    data = train$data, label = train$label,
  booster <- xgb.train(
    data = xgb.DMatrix(train$data, label = train$label),
    nrounds = nrounds, objective = "binary:logistic",
    nthread = 2
  )
@ -17,8 +17,8 @@ test_that("load/save raw works", {
  ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj")
  old_bytes <- xgb.save.raw(booster, raw_format = "deprecated")

  from_json <- xgb.load.raw(json_bytes, as_booster = TRUE)
  from_ubj <- xgb.load.raw(ubj_bytes, as_booster = TRUE)
  from_json <- xgb.load.raw(json_bytes)
  from_ubj <- xgb.load.raw(ubj_bytes)

  json2old <- xgb.save.raw(from_json, raw_format = "deprecated")
  ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated")
@ -26,3 +26,46 @@ test_that("load/save raw works", {
  expect_equal(json2old, ubj2old)
  expect_equal(json2old, old_bytes)
})

test_that("saveRDS preserves C and R attributes", {
  data(mtcars)
  y <- mtcars$mpg
  x <- as.matrix(mtcars[, -1])
  dm <- xgb.DMatrix(x, label = y, nthread = 1)
  model <- xgb.train(
    data = dm,
    params = list(nthread = 1, max_depth = 2),
    nrounds = 5
  )
  attributes(model)$my_attr <- "qwerty"
  xgb.attr(model, "c_attr") <- "asdf"

  fname <- file.path(tempdir(), "xgb_model.Rds")
  saveRDS(model, fname)
  model_new <- readRDS(fname)

  expect_equal(attributes(model_new)$my_attr, attributes(model)$my_attr)
  expect_equal(xgb.attr(model, "c_attr"), xgb.attr(model_new, "c_attr"))
})

test_that("R serializers keep C config", {
  data(mtcars)
  y <- mtcars$mpg
  x <- as.matrix(mtcars[, -1])
  dm <- xgb.DMatrix(x, label = y, nthread = 1)
  model <- xgb.train(
    data = dm,
    params = list(
      tree_method = "approx",
      nthread = 1,
      max_depth = 2
    ),
    nrounds = 3
  )
  model_new <- unserialize(serialize(model, NULL))
  expect_equal(
    xgb.config(model)$learner$gradient_booster$gbtree_train_param$tree_method,
    xgb.config(model_new)$learner$gradient_booster$gbtree_train_param$tree_method
  )
  expect_equal(variable.names(model), variable.names(model_new))
})
@ -23,11 +23,7 @@ get_num_tree <- function(booster) {
}

run_booster_check <- function(booster, name) {
  # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
  if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
    booster <- xgb.Booster.complete(booster)
  }
  config <- jsonlite::fromJSON(xgb.config(booster))
  config <- xgb.config(booster)
  run_model_param_check(config)
  if (name == 'cls') {
    testthat::expect_equal(get_num_tree(booster),
@ -76,6 +72,10 @@ test_that("Models from previous versions of XGBoost can be loaded", {
    name <- m[3]
    is_rds <- endsWith(model_file, '.rds')
    is_json <- endsWith(model_file, '.json')
    # TODO: update this test for new RDS format
    if (is_rds) {
      return(NULL)
    }
    # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
      booster <- readRDS(model_file)
@ -7,9 +7,9 @@ train <- matrix(x, ncol = 1)


test_that("monotone constraints for regression", {
  bst <- xgboost(data = train, label = y, max_depth = 2,
                 eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
                 monotone_constraints = -1)
  bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 2,
                   eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
                   monotone_constraints = -1)

  pred <- predict(bst, train)
Some files were not shown because too many files have changed in this diff.