[R] On-demand serialization + standardization of attributes (#9924)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
david-cortes 2024-01-10 22:08:42 +01:00 committed by GitHub
parent 01c4711556
commit d3a8d284ab
64 changed files with 1773 additions and 1281 deletions


@ -2,16 +2,19 @@
S3method("[",xgb.DMatrix)
S3method("dimnames<-",xgb.DMatrix)
S3method(coef,xgb.Booster)
S3method(dim,xgb.DMatrix)
S3method(dimnames,xgb.DMatrix)
S3method(getinfo,xgb.Booster)
S3method(getinfo,xgb.DMatrix)
S3method(predict,xgb.Booster)
S3method(predict,xgb.Booster.handle)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
S3method(setinfo,xgb.Booster)
S3method(setinfo,xgb.DMatrix)
S3method(slice,xgb.DMatrix)
S3method(variable.names,xgb.Booster)
export("xgb.attr<-")
export("xgb.attributes<-")
export("xgb.config<-")
@ -26,13 +29,13 @@ export(cb.save.model)
export(getinfo)
export(setinfo)
export(slice)
export(xgb.Booster.complete)
export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
export(xgb.attr)
export(xgb.attributes)
export(xgb.config)
export(xgb.copy.Booster)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
@ -41,10 +44,12 @@ export(xgb.get.DMatrix.data)
export(xgb.get.DMatrix.num.non.missing)
export(xgb.get.DMatrix.qcut)
export(xgb.get.config)
export(xgb.get.num.boosted.rounds)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
export(xgb.ggplot.shap.summary)
export(xgb.importance)
export(xgb.is.same.Booster)
export(xgb.load)
export(xgb.load.raw)
export(xgb.model.dt.tree)
@ -56,10 +61,8 @@ export(xgb.plot.shap.summary)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.serialize)
export(xgb.set.config)
export(xgb.train)
export(xgb.unserialize)
export(xgboost)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
@ -88,8 +91,10 @@ importFrom(graphics,title)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(methods,new)
importFrom(stats,coef)
importFrom(stats,median)
importFrom(stats,predict)
importFrom(stats,variable.names)
importFrom(utils,head)
importFrom(utils,object.size)
importFrom(utils,str)


@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) {
})
if (!is.null(env$bst)) {
xgb.parameters(env$bst$handle) <- pars
xgb.parameters(env$bst) <- pars
} else {
for (fd in env$bst_folds)
xgb.parameters(fd$bst) <- pars
@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
if (!is.null(env$bst)) {
if (!inherits(env$bst, 'xgb.Booster'))
stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
best_score <<- as.numeric(best_score)
best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg'))
best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
} else {
xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
}
} else if (is.null(env$bst_folds) || is.null(env$basket)) {
stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
finalizer <- function(env) {
if (!is.null(env$bst)) {
attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
if (best_score != attr_best_score) {
# If the difference is too big, throw an error
if (abs(best_score - attr_best_score) >= 1e-14) {
@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
# If the difference is due to floating-point truncation, update best_score
best_score <- attr_best_score
}
env$bst$best_iteration <- best_iteration
env$bst$best_ntreelimit <- best_ntreelimit
env$bst$best_score <- best_score
xgb.attr(env$bst, "best_iteration") <- best_iteration
xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
xgb.attr(env$bst, "best_score") <- best_score
} else {
env$basket$best_iteration <- best_iteration
env$basket$best_ntreelimit <- best_ntreelimit
@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' @param save_period save the model to disk after every
#' \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#'
#' Note that the format of the model being saved is determined by the file
#' extension specified here (see \link{xgb.save} for details about how it works).
#'
#' It can contain a \code{\link[base]{sprintf}} formatting specifier
#' to include the integer iteration number in the file name.
#' E.g., with \code{save_name} = 'xgboost_%04d.model',
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#'
#' E.g., with \code{save_name} = 'xgboost_%04d.ubj',
#' the file saved at iteration 50 would be named "xgboost_0050.ubj".
#' @seealso \link{xgb.save}
#' @details
#' This callback function allows saving an xgboost model to a file, either periodically after every \code{save_period} iterations or at the end of training.
#'
@ -430,7 +434,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
#' \code{\link{callbacks}}
#'
#' @export
cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
if (save_period < 0)
stop("'save_period' cannot be negative")
@ -440,8 +444,13 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
stop("'save_model' callback requires the 'bst' booster object in its calling frame")
if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
(save_period == 0 && env$iteration == env$end_iteration))
xgb.save(env$bst, sprintf(save_name, env$iteration))
(save_period == 0 && env$iteration == env$end_iteration)) {
# Note: 'sprintf' throws a warning if the name has no format specifier to consume the iteration number
suppressWarnings({
save_name <- sprintf(save_name, env$iteration)
})
xgb.save(env$bst, save_name)
}
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.save.model'
@ -512,8 +521,7 @@ cb.cv.predict <- function(save_models = FALSE) {
env$basket$pred <- pred
if (save_models) {
env$basket$models <- lapply(env$bst_folds, function(fd) {
xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE)
return(fd$bst)
})
}
}
@ -665,7 +673,7 @@ cb.gblinear.history <- function(sparse = FALSE) {
} else { # xgb.cv:
cf <- vector("list", length(env$bst_folds))
for (i in seq_along(env$bst_folds)) {
dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL))
dmp <- xgb.dump(env$bst_folds[[i]]$bst)
cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
}
@ -685,14 +693,19 @@ cb.gblinear.history <- function(sparse = FALSE) {
callback
}
#' Extract gblinear coefficients history.
#'
#' A helper function to extract the matrix of linear coefficients' history
#' @title Extract gblinear coefficients history.
#' @description A helper function to extract the matrix of linear coefficients' history
#' from a gblinear model created while using the \code{cb.gblinear.history()}
#' callback.
#' @details Note that this is an R-specific function that relies on R attributes that
#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
#' or \link{xgb.load.raw}.
#'
#' In order for a serialized model to be accepted by this function, one must use R
#' serializers such as \link{saveRDS}.
#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
#' using the \code{cb.gblinear.history()} callback.
#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
#' loaded from \link{xgb.load} or \link{xgb.load.raw}.
#' @param class_index zero-based class index to extract the coefficients for only that
#' specific class in a multinomial multiclass model. When it is NULL, all the
#' coefficients are returned. Has no effect in non-multiclass models.
@ -713,20 +726,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
is_cv <- inherits(model, "xgb.cv.synchronous")
if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
if (is_cv) {
callbacks <- model$callbacks
} else {
callbacks <- attributes(model)$callbacks
}
if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
stop("model must be trained while using the cb.gblinear.history() callback")
if (!is_cv) {
# extract num_class & num_feat from the internal model
dmp <- xgb.dump(model)
if (length(dmp) < 2 || dmp[2] != "bias:")
stop("It does not appear to be a gblinear model")
dmp <- dmp[-c(1, 2)]
n <- which(dmp == 'weight:')
if (length(n) != 1)
stop("It does not appear to be a gblinear model")
num_class <- n - 1
num_feat <- (length(dmp) - 4) / num_class
num_class <- xgb.num_class(model)
num_feat <- xgb.num_feature(model)
} else {
# in case of CV, the object is expected to have this info
if (model$params$booster != "gblinear")
@ -742,7 +753,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
(class_index[1] < 0 || class_index[1] >= num_class))
stop("class_index has to be within [0,", num_class - 1, "]")
coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
if (!is.null(class_index) && num_class > 1) {
coef_path <- if (is.list(coef_path)) {
lapply(coef_path,


@ -148,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) {
# Update a booster handle for an iteration with dtrain data
xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
if (!identical(class(booster_handle), "xgb.Booster.handle")) {
stop("booster_handle must be of xgb.Booster.handle class")
}
xgb.iter.update <- function(bst, dtrain, iter, obj) {
if (!inherits(dtrain, "xgb.DMatrix")) {
stop("dtrain must be of xgb.DMatrix class")
}
handle <- xgb.get.handle(bst)
if (is.null(obj)) {
.Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
.Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain)
} else {
pred <- predict(
booster_handle,
bst,
dtrain,
outputmargin = TRUE,
training = TRUE,
@ -185,7 +183,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
}
.Call(
XGBoosterTrainOneIter_R, booster_handle, dtrain, iter, grad, hess
XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess
)
}
return(TRUE)
@ -195,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
# Evaluate one iteration.
# Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) {
if (!identical(class(booster_handle), "xgb.Booster.handle"))
stop("class of booster_handle must be xgb.Booster.handle")
xgb.iter.eval <- function(bst, watchlist, iter, feval) {
handle <- xgb.get.handle(bst)
if (length(watchlist) == 0)
return(NULL)
evnames <- names(watchlist)
if (is.null(feval)) {
msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
} else {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
## predict using all trees
preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)
@ -352,16 +349,45 @@ xgb.createFolds <- function(y, k) {
#' @name xgboost-deprecated
NULL
#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
#' @title Model Serialization and Compatibility
#' @description
#'
#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
#' the model is to be accessed in the future. If you train a model with the current version of
#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
#' its own serializers with better compatibility guarantees, which allow loading
#' said models in other language bindings of XGBoost.
#'
#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{
#' \item Additional model configuration (accessible through \link{xgb.config}),
#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`.
#' These are not necessarily useful for prediction/importance/plotting.
#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log`
#' if present.
#' }
#'
#' The first one (configurations) does not have the same compatibility guarantees as
#' the model itself, including attributes that are set and accessed through
#' \link{xgb.attributes} - that is, such configuration might be lost after loading the
#' booster in a different XGBoost version, regardless of the serializer that was used.
#' These are saved when using \link{saveRDS}, but will be discarded if loaded into an
#' incompatible XGBoost version. They are not saved when using XGBoost's serializers
#' from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
#'
#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
#' not saved when using XGBoost's own serializers. These attributes are only used for informational
#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
#' These R attributes are only preserved when using R's serializers.
#'
#' Note that XGBoost models in R from version `2.1.0` onwards and XGBoost models from
#' before version `2.1.0` have very different R object structures and are incompatible
#' with each other. Hence, models that were saved with R serializers like `saveRDS` or
#' `save` before version `2.1.0` will not work with later `xgboost` versions and vice
#' versa. Be aware that the structure of R model objects could in theory change again
#' in the future, so XGBoost's serializers should be preferred for long-term storage.
#'
#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
#' higher of said package, and will have the same compatibility restrictions as R serializers.
#'
#' @details
#' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@ -374,9 +400,10 @@ NULL
#' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
#' as part of another R object.
#'
#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
#' model but also internal configurations and parameters, and its format is not stable across
#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
#' control as it relies on R's serialization format (see e.g. the details section in
#' \link{serialize} and \link{save} from base R).
#'
#' For more details and explanation about model persistence and archival, consult the page
#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.


@ -1,180 +1,85 @@
# Construct an internal xgboost Booster and return a handle to it.
# Construct an internal xgboost Booster and get its current number of rounds.
# internal utility function
xgb.Booster.handle <- function(params, cachelist, modelfile, handle) {
# Note: the number of rounds in the C booster gets reset to zero when changing
# key booster parameters like 'process_type=update'. In some cases, when
# replacing previous iterations, a check is needed that the new number of
# iterations doesn't exceed the previous one, so the current number of
# iterations is recorded before resetting the parameters in order to
# perform the check later on.
xgb.Booster <- function(params, cachelist, modelfile) {
if (typeof(cachelist) != "list" ||
!all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) {
stop("cachelist must be a list of xgb.DMatrix objects")
}
## Load existing model, dispatch for on disk model file and in memory buffer
if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") {
if (is.character(modelfile)) {
## A filename
handle <- .Call(XGBoosterCreate_R, cachelist)
bst <- .Call(XGBoosterCreate_R, cachelist)
modelfile <- path.expand(modelfile)
.Call(XGBoosterLoadModel_R, handle, enc2utf8(modelfile[1]))
class(handle) <- "xgb.Booster.handle"
.Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1]))
niter <- xgb.get.num.boosted.rounds(bst)
if (length(params) > 0) {
xgb.parameters(handle) <- params
xgb.parameters(bst) <- params
}
return(handle)
} else if (typeof(modelfile) == "raw") {
return(list(bst = bst, niter = niter))
} else if (is.raw(modelfile)) {
## A memory buffer
bst <- xgb.unserialize(modelfile, handle)
bst <- xgb.load.raw(modelfile)
niter <- xgb.get.num.boosted.rounds(bst)
xgb.parameters(bst) <- params
return(bst)
return(list(bst = bst, niter = niter))
} else if (inherits(modelfile, "xgb.Booster")) {
## A booster object
bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
bst <- xgb.unserialize(bst$raw)
bst <- .Call(XGDuplicate_R, modelfile)
niter <- xgb.get.num.boosted.rounds(bst)
xgb.parameters(bst) <- params
return(bst)
return(list(bst = bst, niter = niter))
} else {
stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
}
}
## Create new model
handle <- .Call(XGBoosterCreate_R, cachelist)
class(handle) <- "xgb.Booster.handle"
bst <- .Call(XGBoosterCreate_R, cachelist)
if (length(params) > 0) {
xgb.parameters(handle) <- params
xgb.parameters(bst) <- params
}
return(handle)
return(list(bst = bst, niter = 0L))
}
# Convert xgb.Booster.handle to xgb.Booster
# internal utility function
xgb.handleToBooster <- function(handle, raw) {
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
# Check whether xgb.Booster.handle is null
# Check whether xgb.Booster handle is null
# internal utility function
is.null.handle <- function(handle) {
if (is.null(handle)) return(TRUE)
if (!identical(class(handle), "xgb.Booster.handle"))
stop("argument type must be xgb.Booster.handle")
if (!inherits(handle, "externalptr"))
stop("argument type must be 'externalptr'")
if (.Call(XGCheckNullPtr_R, handle))
return(TRUE)
return(FALSE)
return(.Call(XGCheckNullPtr_R, handle))
}
# Return a verified to be valid handle out of either xgb.Booster.handle or
# xgb.Booster internal utility function
# Return a verified to be valid handle out of xgb.Booster
# internal utility function
xgb.get.handle <- function(object) {
if (inherits(object, "xgb.Booster")) {
handle <- object$handle
} else if (inherits(object, "xgb.Booster.handle")) {
handle <- object
handle <- object$ptr
if (is.null(handle) || !inherits(handle, "externalptr")) {
stop("'xgb.Booster' object is corrupted or is from an incompatible xgboost version.")
}
} else {
stop("argument must be of either xgb.Booster or xgb.Booster.handle class")
stop("argument must be an 'xgb.Booster' object.")
}
if (is.null.handle(handle)) {
stop("invalid xgb.Booster.handle")
stop("invalid 'xgb.Booster' (blank 'externalptr').")
}
handle
}
#' Restore missing parts of an incomplete xgb.Booster object
#'
#' It attempts to complete an `xgb.Booster` object by restoring either its missing
#' raw model memory dump (when it has no `raw` data but its `xgb.Booster.handle` is valid)
#' or its missing internal handle (when its `xgb.Booster.handle` is not valid
#' but it has a raw Booster memory dump).
#'
#' @param object Object of class `xgb.Booster`.
#' @param saveraw A flag indicating whether to append `raw` Booster memory dump data
#' when it doesn't already exist.
#'
#' @details
#'
#' While this method is primarily for internal use, it might be useful in some practical situations.
#'
#' E.g., when an `xgb.Booster` model is saved as an R object and then is loaded as an R object,
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
#' should still work for such a model object since those methods would be using
#' `xgb.Booster.complete()` internally. However, one might find it to be more efficient to call the
#' `xgb.Booster.complete()` function explicitly once after loading a model as an R-object.
#' That would prevent further repeated implicit reconstruction of an internal booster model.
#'
#' @return
#' An object of `xgb.Booster` class.
#'
#' @examples
#'
#' data(agaricus.train, package = "xgboost")
#'
#' bst <- xgboost(
#' data = agaricus.train$data,
#' label = agaricus.train$label,
#' max_depth = 2,
#' eta = 1,
#' nthread = 2,
#' nrounds = 2,
#' objective = "binary:logistic"
#' )
#'
#' fname <- file.path(tempdir(), "xgb_model.Rds")
#' saveRDS(bst, fname)
#'
#' # Warning: The resulting RDS file is only compatible with the current XGBoost version.
#' # Refer to the section titled "a-compatibility-note-for-saveRDS-save".
#' bst1 <- readRDS(fname)
#' # the handle is invalid:
#' print(bst1$handle)
#'
#' bst1 <- xgb.Booster.complete(bst1)
#' # now the handle points to a valid internal booster model:
#' print(bst1$handle)
#'
#' @export
xgb.Booster.complete <- function(object, saveraw = TRUE) {
if (!inherits(object, "xgb.Booster"))
stop("argument type must be xgb.Booster")
if (is.null.handle(object$handle)) {
object$handle <- xgb.Booster.handle(
params = list(),
cachelist = list(),
modelfile = object$raw,
handle = object$handle
)
} else {
if (is.null(object$raw) && saveraw) {
object$raw <- xgb.serialize(object$handle)
}
}
attrs <- xgb.attributes(object)
if (!is.null(attrs$best_ntreelimit)) {
object$best_ntreelimit <- as.integer(attrs$best_ntreelimit)
}
if (!is.null(attrs$best_iteration)) {
## Convert from 0 based back to 1 based.
object$best_iteration <- as.integer(attrs$best_iteration) + 1
}
if (!is.null(attrs$best_score)) {
object$best_score <- as.numeric(attrs$best_score)
}
if (!is.null(attrs$best_msg)) {
object$best_msg <- attrs$best_msg
}
if (!is.null(attrs$niter)) {
object$niter <- as.integer(attrs$niter)
}
return(object)
return(handle)
}
#' Predict method for XGBoost model
#'
#' Predicted values based on either xgboost model or model handle object.
#'
#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
#' @param object Object of class `xgb.Booster`.
#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
#' local data file, or `xgb.DMatrix`.
#' For single-row predictions on sparse data, it is recommended to use the CSR format.
@ -358,27 +263,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
#' sum(pred5 != lb) / length(lb)
#'
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE)
if (!inherits(newdata, "xgb.DMatrix")) {
config <- jsonlite::fromJSON(xgb.config(object))
nthread <- strtoi(config$learner$generic_param$nthread)
nthread <- xgb.nthread(object)
newdata <- xgb.DMatrix(
newdata,
missing = missing, nthread = NVL(nthread, -1)
)
}
if (!is.null(object[["feature_names"]]) &&
!is.null(colnames(newdata)) &&
!identical(object[["feature_names"]], colnames(newdata)))
stop("Feature names stored in `object` and `newdata` are different!")
if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
if (NVL(xgb.booster_type(object), '') == 'gblinear' || is.null(ntreelimit))
ntreelimit <- 0
if (ntreelimit != 0 && is.null(iterationrange)) {
@ -391,11 +288,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
## both are specified, let libgxgboost throw an error
} else {
## no limit is supplied, use best
if (is.null(object$best_iteration)) {
best_iteration <- xgb.best_iteration(object)
if (is.null(best_iteration)) {
iterationrange <- c(0, 0)
} else {
## We don't need to + 1 as R is 1-based index.
iterationrange <- c(0, as.integer(object$best_iteration))
iterationrange <- c(0, as.integer(best_iteration))
}
}
## Handle the 0 length values.
@ -438,7 +336,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
}
predts <- .Call(
XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
XGBoosterPredictFromDMatrix_R,
xgb.get.handle(object),
newdata,
jsonlite::toJSON(args, auto_unbox = TRUE)
)
names(predts) <- c("shape", "results")
shape <- predts$shape
@ -509,22 +410,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
return(arr)
}
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster.handle <- function(object, ...) {
bst <- xgb.handleToBooster(handle = object, raw = NULL)
ret <- predict(bst, ...)
return(ret)
}
#' Accessors for serializable attributes of a model
#' @title Accessors for serializable attributes of a model
#'
#' These methods allow manipulating the key-value attribute strings of an xgboost model.
#' @description These methods allow manipulating the key-value attribute strings of an xgboost model.
#'
#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
#' @param name A non-empty character string specifying which attribute is to be accessed.
#' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`,
#' it is a list (or an object coercible to a list) with the names of attributes to set
@ -546,16 +437,15 @@ predict.xgb.Booster.handle <- function(object, ...) {
#' change the value of that parameter for a model.
#' Use [xgb.parameters<-()] to set or change model parameters.
#'
#' The attribute setters would usually work more efficiently for `xgb.Booster.handle`
#' than for `xgb.Booster`, since only just a handle (pointer) would need to be copied.
#' That would only matter if attributes need to be set many times.
#' Note, however, that when feeding a handle of an `xgb.Booster` object to the attribute setters,
#' the raw model cache of an `xgb.Booster` object would not be automatically updated,
#' and it would be the user's responsibility to call [xgb.serialize()] to update it.
#'
#' The `xgb.attributes<-` setter either updates existing attributes or adds one or several new ones,
#' but it doesn't delete the other existing attributes.
#'
#' Important: since this modifies the booster's C object, semantics for assignment here
#' will differ from R's, as any object reference to the same booster will be modified
#' too, while assignment of R attributes through `attributes(model)$<attr> <- <value>`
#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
#' example of these behaviors).
#'
#' @return
#' - `xgb.attr()` returns either a string value of an attribute
#' or `NULL` if an attribute wasn't stored in a model.
@ -597,14 +487,25 @@ predict.xgb.Booster.handle <- function(object, ...) {
xgb.attr <- function(object, name) {
if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
handle <- xgb.get.handle(object)
.Call(XGBoosterGetAttr_R, handle, as.character(name[1]))
out <- .Call(XGBoosterGetAttr_R, handle, as.character(name[1]))
if (!NROW(out) || !nchar(out)) {
return(NULL)
}
if (!is.null(out)) {
if (name %in% c("best_iteration", "best_ntreelimit", "best_score")) {
out <- as.numeric(out)
}
}
return(out)
}
#' @rdname xgb.attr
#' @export
`xgb.attr<-` <- function(object, name, value) {
if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
name <- as.character(name[1])
if (!NROW(name) || !nchar(name)) stop("invalid attribute name")
handle <- xgb.get.handle(object)
if (!is.null(value)) {
# Coerce the elements to be scalar strings.
# Q: should we warn user about non-scalar elements?
@ -614,11 +515,8 @@ xgb.attr <- function(object, name) {
value <- as.character(value[1])
}
}
.Call(XGBoosterSetAttr_R, handle, as.character(name[1]), value)
if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
object$raw <- xgb.serialize(object$handle)
}
object
.Call(XGBoosterSetAttr_R, handle, name, value)
return(object)
}
#' @rdname xgb.attr
@ -626,12 +524,10 @@ xgb.attr <- function(object, name) {
xgb.attributes <- function(object) {
handle <- xgb.get.handle(object)
attr_names <- .Call(XGBoosterGetAttrNames_R, handle)
if (is.null(attr_names)) return(NULL)
res <- lapply(attr_names, function(x) {
.Call(XGBoosterGetAttr_R, handle, x)
})
names(res) <- attr_names
res
if (!NROW(attr_names)) return(list())
out <- lapply(attr_names, function(name) xgb.attr(object, name))
names(out) <- attr_names
return(out)
}
#' @rdname xgb.attr
@ -641,31 +537,21 @@ xgb.attributes <- function(object) {
if (is.null(names(a)) || any(nchar(names(a)) == 0)) {
stop("attribute names cannot be empty strings")
}
# Coerce the elements to be scalar strings.
# Q: should we warn a user about non-scalar elements?
a <- lapply(a, function(x) {
if (is.null(x)) return(NULL)
if (is.numeric(x[1])) {
format(x[1], digits = 17)
} else {
as.character(x[1])
}
})
handle <- xgb.get.handle(object)
for (i in seq_along(a)) {
.Call(XGBoosterSetAttr_R, handle, names(a[i]), a[[i]])
xgb.attr(object, names(a[i])) <- a[[i]]
}
if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
object$raw <- xgb.serialize(object$handle)
}
object
return(object)
}
#' Accessors for model parameters as JSON string
#'
#' @param object Object of class `xgb.Booster`.
#' @param value A JSON string.
#' @title Accessors for model parameters as JSON string
#' @details Note that assignment is performed in-place on the booster C object, which unlike assignment
#' of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
#' to the same booster will also get updated.
#'
#' See \link{xgb.copy.Booster} for an example of this behavior.
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
#' @param value An R list.
#' @return `xgb.config` will return the parameters as an R list.
#' @examples
#' data(agaricus.train, package = "xgboost")
#'
@ -690,31 +576,36 @@ xgb.attributes <- function(object) {
#' @export
xgb.config <- function(object) {
handle <- xgb.get.handle(object)
.Call(XGBoosterSaveJsonConfig_R, handle)
return(jsonlite::fromJSON(.Call(XGBoosterSaveJsonConfig_R, handle)))
}
#' @rdname xgb.config
#' @export
`xgb.config<-` <- function(object, value) {
handle <- xgb.get.handle(object)
.Call(XGBoosterLoadJsonConfig_R, handle, value)
object$raw <- NULL # force renew the raw buffer
object <- xgb.Booster.complete(object)
object
.Call(
XGBoosterLoadJsonConfig_R,
handle,
jsonlite::toJSON(value, auto_unbox = TRUE, null = "null")
)
return(object)
}
#' Accessors for model parameters
#' @title Accessors for model parameters
#' @description Only the setter for xgboost parameters is currently implemented.
#' @details Just like \link{xgb.attr}, this function will make in-place modifications
#' on the booster object which do not follow typical R assignment semantics - that is,
#' all references to the same booster will also be updated, unlike assignment of R
#' attributes which follow copy-on-write semantics.
#'
#' Only the setter for xgboost parameters is currently implemented.
#' See \link{xgb.copy.Booster} for an example of this behavior.
#'
#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
#' Be aware that setting parameters of a fitted booster related to training continuation / updates
#' will reset its number of rounds indicator to zero.
#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}.
#' @param value A list (or an object coercible to a list) with the names of parameters to set
#' and the elements corresponding to parameter values.
#'
#' @details
#' Note that the setter would usually work more efficiently for `xgb.Booster.handle`
#' than for `xgb.Booster`, since only just a handle would need to be copied.
#'
#' @return The same booster `object`, which gets modified in-place.
#' @examples
#' data(agaricus.train, package = "xgboost")
#' train <- agaricus.train
@ -751,28 +642,301 @@ xgb.config <- function(object) {
for (i in seq_along(p)) {
.Call(XGBoosterSetParam_R, handle, names(p[i]), p[[i]])
}
if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
object$raw <- xgb.serialize(object$handle)
return(object)
}
#' @rdname getinfo
#' @export
getinfo.xgb.Booster <- function(object, name) {
name <- as.character(head(name, 1L))
allowed_fields <- c("feature_name", "feature_type")
if (!(name %in% allowed_fields)) {
stop("getinfo: name must be one of the following: ", paste(allowed_fields, collapse = ", "))
}
object
handle <- xgb.get.handle(object)
out <- .Call(
XGBoosterGetStrFeatureInfo_R,
handle,
name
)
if (!NROW(out)) {
return(NULL)
}
return(out)
}
#' @rdname getinfo
#' @export
setinfo.xgb.Booster <- function(object, name, info) {
name <- as.character(head(name, 1L))
allowed_fields <- c("feature_name", "feature_type")
if (!(name %in% allowed_fields)) {
stop("setinfo: unknown info name ", name)
}
info <- as.character(info)
handle <- xgb.get.handle(object)
.Call(
XGBoosterSetStrFeatureInfo_R,
handle,
name,
info
)
return(TRUE)
}
#' @title Get number of boosting rounds in a fitted booster
#' @param model A fitted `xgb.Booster` model.
#' @return The number of rounds saved in the model, as an integer.
#' @details Note that setting booster parameters related to training
#' continuation / updates through \link{xgb.parameters<-} will reset the
#' number of rounds to zero.
#' @export
xgb.get.num.boosted.rounds <- function(model) {
return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model)))
}
#' @title Get Feature Names from Booster
#' @description Returns the feature / variable / column names from a fitted
#' booster object, which are set automatically during the call to \link{xgb.train}
#' from the DMatrix names, or which can be set manually through \link{setinfo}.
#'
#' If the object doesn't have feature names, will return `NULL`.
#'
#' It is equivalent to calling `getinfo(object, "feature_name")`.
#' @param object An `xgb.Booster` object.
#' @param ... Not used.
#' @export
variable.names.xgb.Booster <- function(object, ...) {
return(getinfo(object, "feature_name"))
}
# Extract the number of trees in a model.
# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration.
# internal utility function
xgb.ntree <- function(bst) {
length(grep('^booster', xgb.dump(bst)))
config <- xgb.config(bst)
out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees)
return(out)
}
xgb.nthread <- function(bst) {
config <- xgb.config(bst)
out <- strtoi(config$learner$generic_param$nthread)
return(out)
}
#' Print xgb.Booster
xgb.booster_type <- function(bst) {
config <- xgb.config(bst)
out <- config$learner$learner_train_param$booster
return(out)
}
xgb.num_class <- function(bst) {
config <- xgb.config(bst)
out <- strtoi(config$learner$learner_model_param$num_class)
return(out)
}
xgb.feature_names <- function(bst) {
return(getinfo(bst, "feature_name"))
}
xgb.feature_types <- function(bst) {
return(getinfo(bst, "feature_type"))
}
xgb.num_feature <- function(bst) {
handle <- xgb.get.handle(bst)
return(.Call(XGBoosterGetNumFeature_R, handle))
}
xgb.best_iteration <- function(bst) {
out <- xgb.attr(bst, "best_iteration")
if (!NROW(out) || !nchar(out)) {
out <- NULL
}
return(out)
}
#' @title Extract coefficients from linear booster
#' @description Extracts the coefficients from a 'gblinear' booster object,
#' as produced by \code{xgb.train} when using parameter `booster="gblinear"`.
#'
#' Print information about `xgb.Booster`.
#' Note: this function will error out when passed a booster model
#' which is not of "gblinear" type.
#' @param object A fitted booster of 'gblinear' type.
#' @param ... Not used.
#' @return The extracted coefficients:\itemize{
#' \item If there's only one coefficient per column in the data, they will be returned as a
#' vector, potentially containing the feature names if available, with the intercept
#' as the first element.
#' \item If there's more than one coefficient per column in the data (e.g. when using
#' `objective="multi:softmax"`), they will be returned as a matrix with dimensions equal
#' to `[num_features + 1, num_cols]`, with the intercepts as the first row. Note that the column
#' (classes in multi-class classification) dimension will not be named.
#' }
#'
#' The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
#' or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
#' hence one should get the same values from calling `predict(..., outputmargin = TRUE)` and
#' from performing a matrix multiplication with `model.matrix(~., ...)`.
#'
#' Be aware that the coefficients are obtained by first converting them to strings and
#' back, so there will always be some very small loss of precision compared to the actual
#' coefficients as used by \link{predict.xgb.Booster}.
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars[, 1]
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
#' params <- list(booster = "gblinear", nthread = 1)
#' model <- xgb.train(data = dm, params = params, nrounds = 2)
#' coef(model)
#' @export
coef.xgb.Booster <- function(object, ...) {
booster_type <- xgb.booster_type(object)
if (booster_type != "gblinear") {
stop("Coefficients are not defined for Booster type ", booster_type)
}
model_json <- jsonlite::fromJSON(rawToChar(xgb.save.raw(object, raw_format = "json")))
base_score <- model_json$learner$learner_model_param$base_score
num_feature <- as.numeric(model_json$learner$learner_model_param$num_feature)
weights <- model_json$learner$gradient_booster$model$weights
n_cols <- length(weights) / (num_feature + 1)
if (n_cols != floor(n_cols) || n_cols < 1) {
stop("Internal error: could not determine shape of coefficients.")
}
sep <- num_feature * n_cols
coefs <- weights[seq(1, sep)]
intercepts <- weights[seq(sep + 1, length(weights))]
intercepts <- intercepts + as.numeric(base_score)
feature_names <- xgb.feature_names(object)
if (!NROW(feature_names)) {
# This mimics the default naming in R which names columns as "V1..N"
# when names are needed but not available
feature_names <- paste0("V", seq(1L, num_feature))
}
feature_names <- c("(Intercept)", feature_names)
if (n_cols == 1L) {
out <- c(intercepts, coefs)
names(out) <- feature_names
} else {
coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE)
dim(intercepts) <- c(1L, n_cols)
out <- rbind(intercepts, coefs)
row.names(out) <- feature_names
# TODO: if a class names attributes is added,
# should use those names here.
}
return(out)
}
#' @title Deep-copies a Booster Object
#' @description Creates a deep copy of an 'xgb.Booster' object, such that the
#' C object pointer contained will be a different object, and hence functions
#' like \link{xgb.attr} will not affect the object from which it was copied.
#' @param model An 'xgb.Booster' object.
#' @return A deep copy of `model` - it will be identical in every way, but C-level
#' functions called on that copy will not affect the `model` variable.
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- mtcars[, -1]
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#' model <- xgb.train(
#' data = dm,
#' params = list(nthread = 1),
#' nround = 3
#' )
#'
#' # Set an arbitrary attribute kept at the C level
#' xgb.attr(model, "my_attr") <- 100
#' print(xgb.attr(model, "my_attr"))
#'
#' # Just assigning to a new variable will not create
#' # a deep copy - C object pointer is shared, and in-place
#' # modifications will affect both objects
#' model_shallow_copy <- model
#' xgb.attr(model_shallow_copy, "my_attr") <- 333
#' # 'model' was also affected by this change:
#' print(xgb.attr(model, "my_attr"))
#'
#' model_deep_copy <- xgb.copy.Booster(model)
#' xgb.attr(model_deep_copy, "my_attr") <- 444
#' # 'model' was NOT affected by this change
#' # (keeps previous value that was assigned before)
#' print(xgb.attr(model, "my_attr"))
#'
#' # Verify that the new object was actually modified
#' print(xgb.attr(model_deep_copy, "my_attr"))
#' @export
xgb.copy.Booster <- function(model) {
if (!inherits(model, "xgb.Booster")) {
stop("'model' must be an 'xgb.Booster' object.")
}
return(.Call(XGDuplicate_R, model))
}
#' @title Check if two boosters share the same C object
#' @description Checks whether two booster objects refer to the same underlying C object.
#' @details As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
#' object, they don't follow typical copy-on-write semantics of other R objects - that is, if
#' one assigns a booster to a different variable and modifies that new variable through in-place
#' methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
#' variable, unlike typical R assignments which would only modify the latter.
#'
#' This function allows checking whether two booster objects share the same 'externalptr',
#' regardless of the R attributes that they might have.
#'
#' In order to duplicate a booster in such a way that the copy wouldn't share the same
#' 'externalptr', one can use function \link{xgb.copy.Booster}.
#' @param obj1 Booster model to compare with `obj2`.
#' @param obj2 Booster model to compare with `obj1`.
#' @return Either `TRUE` or `FALSE` according to whether the two boosters share
#' the underlying C object.
#' @seealso \link{xgb.copy.Booster}
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' model <- xgb.train(
#' params = list(nthread = 1),
#' data = xgb.DMatrix(x, label = y, nthread = 1),
#' nround = 3
#' )
#'
#' model_shallow_copy <- model
#' xgb.is.same.Booster(model, model_shallow_copy) # same C object
#'
#' model_deep_copy <- xgb.copy.Booster(model)
#' xgb.is.same.Booster(model, model_deep_copy) # different C objects
#'
#' # In-place assignments modify all references,
#' # but not full/deep copies of the booster
#' xgb.attr(model_shallow_copy, "my_attr") <- 111
#' xgb.attr(model, "my_attr") # gets modified
#' xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
#' @export
xgb.is.same.Booster <- function(obj1, obj2) {
if (!inherits(obj1, "xgb.Booster") || !inherits(obj2, "xgb.Booster")) {
stop("'xgb.is.same.Booster' is only applicable to 'xgb.Booster' objects.")
}
return(
.Call(
XGPointerEqComparison_R,
xgb.get.handle(obj1),
xgb.get.handle(obj2)
)
)
}
#' @title Print xgb.Booster
#' @description Print information about `xgb.Booster`.
#' @param x An `xgb.Booster` object.
#' @param verbose Whether to print detailed data (e.g., attribute values).
#' @param ... Not currently used.
#'
#' @param ... Not used.
#' @return The same `x` object, returned invisibly
#' @examples
#' data(agaricus.train, package = "xgboost")
#' train <- agaricus.train
@ -790,79 +954,40 @@ xgb.ntree <- function(bst) {
#' attr(bst, "myattr") <- "memo"
#'
#' print(bst)
#' print(bst, verbose = TRUE)
#'
#' @export
print.xgb.Booster <- function(x, verbose = FALSE, ...) {
print.xgb.Booster <- function(x, ...) {
# this lets it error out when the object comes from an earlier R xgboost version
handle <- xgb.get.handle(x)
cat('##### xgb.Booster\n')
valid_handle <- !is.null.handle(x$handle)
if (!valid_handle)
cat("Handle is invalid! Suggest using xgb.Booster.complete\n")
cat('raw: ')
if (!is.null(x$raw)) {
cat(format(object.size(x$raw), units = "auto"), '\n')
} else {
cat('NULL\n')
}
if (!is.null(x$call)) {
R_attrs <- attributes(x)
if (!is.null(R_attrs$call)) {
cat('call:\n ')
print(x$call)
print(R_attrs$call)
}
if (!is.null(x$params)) {
cat('params (as set within xgb.train):\n')
cat(' ',
paste(names(x$params),
paste0('"', unlist(x$params), '"'),
sep = ' = ', collapse = ', '), '\n', sep = '')
}
# TODO: need an interface to access all of xgboost's parameters
cat('# of features:', xgb.num_feature(x), '\n')
cat('# of rounds: ', xgb.get.num.boosted.rounds(x), '\n')
attrs <- character(0)
if (valid_handle)
attrs <- xgb.attributes(x)
if (length(attrs) > 0) {
attr_names <- .Call(XGBoosterGetAttrNames_R, handle)
if (NROW(attr_names)) {
cat('xgb.attributes:\n')
if (verbose) {
cat(paste(paste0(' ', names(attrs)),
paste0('"', unlist(attrs), '"'),
sep = ' = ', collapse = '\n'), '\n', sep = '')
} else {
cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
}
cat(" ", paste(attr_names, collapse = ", "), "\n")
}
if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) {
cat('callbacks:\n')
lapply(callback.calls(x$callbacks), function(x) {
lapply(callback.calls(R_attrs$callbacks), function(x) {
cat(' ')
print(x)
})
}
if (!is.null(x$feature_names))
cat('# of features:', length(x$feature_names), '\n')
cat('niter: ', x$niter, '\n', sep = '')
# TODO: uncomment when faster xgb.ntree is implemented
#cat('ntree: ', xgb.ntree(x), '\n', sep='')
for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
'evaluation_log', 'niter', 'feature_names'))) {
if (is.atomic(x[[n]])) {
cat(n, ':', x[[n]], '\n', sep = ' ')
} else {
cat(n, ':\n\t', sep = ' ')
print(x[[n]])
}
}
if (!is.null(x$evaluation_log)) {
if (!is.null(R_attrs$evaluation_log)) {
cat('evaluation_log:\n')
print(x$evaluation_log, row.names = FALSE, topn = 2)
print(R_attrs$evaluation_log, row.names = FALSE, topn = 2)
}
invisible(x)
return(invisible(x))
}


@ -335,14 +335,13 @@ dimnames.xgb.DMatrix <- function(x) {
}
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#' @param object Object of class \code{xgb.DMatrix}
#' @title Get or set information of xgb.DMatrix and xgb.Booster objects
#' @param object Object of class \code{xgb.DMatrix} or `xgb.Booster`.
#' @param name the name of the information field to get (see details)
#'
#' @return For `getinfo`, will return the requested field. For `setinfo`, will always return value `TRUE`
#' if it succeeds.
#' @details
#' The \code{name} field can be one of the following:
#' The \code{name} field can be one of the following for `xgb.DMatrix`:
#'
#' \itemize{
#' \item \code{label}
@ -357,9 +356,17 @@ dimnames.xgb.DMatrix <- function(x) {
#' }
#' See the documentation for \link{xgb.DMatrix} for more information about these fields.
#'
#' For `xgb.Booster`, can be one of the following:
#' \itemize{
#' \item \code{feature_type}
#' \item \code{feature_name}
#' }
#'
#' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
#' for a DMatrix that had 'qid' assigned.
#'
#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See
#' \link{xgb.copy.Booster} for an idea of how this in-place assignment works.
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
@ -412,13 +419,7 @@ getinfo.xgb.DMatrix <- function(object, name) {
return(ret)
}
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @rdname getinfo
#' @param info the specific field of information to set
#'
#' @details
@ -441,11 +442,10 @@ getinfo.xgb.DMatrix <- function(object, name) {
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, name, info) UseMethod("setinfo")
#' @rdname setinfo
#' @rdname getinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info) {
.internal.setinfo.xgb.DMatrix(object, name, info)


@ -204,13 +204,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
dtrain <- slice(dall, unlist(folds[-k]))
else
dtrain <- slice(dall, train_folds[[k]])
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
modelfile = NULL,
handle = NULL
modelfile = NULL
)
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
bst <- bst$bst
list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
})
rm(dall)
# a "basket" to collect some results from callbacks
@ -231,13 +231,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(
booster_handle = fd$bst,
bst = fd$bst,
dtrain = fd$dtrain,
iter = iteration - 1,
obj = obj
)
xgb.iter.eval(
booster_handle = fd$bst,
bst = fd$bst,
watchlist = fd$watchlist,
iter = iteration - 1,
feval = feval
@ -267,7 +267,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
ret <- c(ret, basket)
class(ret) <- 'xgb.cv.synchronous'
invisible(ret)
return(invisible(ret))
}


@ -56,9 +56,13 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be a character string (when provided)")
model <- xgb.Booster.complete(model)
model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format))
model_dump <- .Call(
XGBoosterDumpModel_R,
xgb.get.handle(model),
NVL(fmap, "")[1],
as.integer(with_stats),
as.character(dump_format)
)
if (dump_format == "dot") {
return(sapply(model_dump, function(x) gsub("^booster\\[\\d+\\]\\n", "\\1", x)))
}


@ -119,21 +119,21 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
if (!inherits(model, "xgb.Booster"))
stop("model: must be an object of class xgb.Booster")
if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names
if (is.null(feature_names)) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names)) {
feature_names <- model_feature_names
}
}
if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector")
model <- xgb.Booster.complete(model)
config <- jsonlite::fromJSON(xgb.config(model))
if (config$learner$gradient_booster$name == "gblinear") {
handle <- xgb.get.handle(model)
if (xgb.booster_type(model) == "gblinear") {
args <- list(importance_type = "weight", feature_names = feature_names)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", "weight")
if (length(results$shape) == 2) {
@ -154,7 +154,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
for (importance_type in c("weight", "total_gain", "total_cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", importance_type)
concatenated[


@ -17,7 +17,7 @@
#' An object of \code{xgb.Booster} class.
#'
#' @seealso
#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.save}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -46,25 +46,20 @@ xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop("xgb.load: modelfile cannot be NULL")
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = list(),
cachelist = list(),
modelfile = modelfile,
handle = NULL
modelfile = modelfile
)
bst <- bst$bst
# re-use modelfile if it is raw so we do not need to serialize
if (typeof(modelfile) == "raw") {
warning(
paste(
"The support for loading raw booster with `xgb.load` will be ",
"discontinued in upcoming release. Use `xgb.load.raw` or",
" `xgb.unserialize` instead. "
"discontinued in upcoming release. Use `xgb.load.raw` instead. "
)
)
bst <- xgb.handleToBooster(handle = handle, raw = modelfile)
} else {
bst <- xgb.handleToBooster(handle = handle, raw = NULL)
}
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
return(bst)
}


@ -3,21 +3,10 @@
#' User can generate raw memory buffer by calling xgb.save.raw
#'
#' @param buffer the buffer returned by xgb.save.raw
#' @param as_booster Return the loaded model as xgb.Booster instead of xgb.Booster.handle.
#'
#' @export
xgb.load.raw <- function(buffer, as_booster = FALSE) {
xgb.load.raw <- function(buffer) {
cachelist <- list()
handle <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
class(handle) <- "xgb.Booster.handle"
if (as_booster) {
booster <- list(handle = handle, raw = NULL)
class(booster) <- "xgb.Booster"
booster <- xgb.Booster.complete(booster, saveraw = TRUE)
return(booster)
} else {
return(handle)
}
bst <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer)
return(bst)
}


@ -2,8 +2,10 @@
#'
#' Parse a boosted tree model text dump into a `data.table` structure.
#'
#' @param feature_names Character vector used to overwrite the feature names
#' of the model. The default (`NULL`) uses the original feature names.
#' @param feature_names Character vector of feature names. If the model already
#' contains feature names, those will be used when \code{feature_names=NULL} (default value).
#'
#' Note that, if the model already contains feature names, it's \bold{not} possible to override them here.
#' @param model Object of class `xgb.Booster`.
#' @param text Character vector previously generated by the function [xgb.dump()]
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
@ -54,8 +56,6 @@
#' objective = "binary:logistic"
#' )
#'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#'
#' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst))
@ -79,8 +79,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
" (or NULL if 'model' was provided).")
}
if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names
model_feature_names <- NULL
if (inherits(model, "xgb.Booster")) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names) && !is.null(feature_names)) {
stop("'model' contains feature names. Cannot override them.")
}
}
if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names))
feature_names <- model_feature_names
if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector")
@ -90,8 +97,10 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
stop("trees: must be a vector of integers.")
}
from_text <- TRUE
if (is.null(text)) {
text <- xgb.dump(model = model, with_stats = TRUE)
from_text <- FALSE
}
if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) {
@ -120,8 +129,28 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
td[, isLeaf := grepl("leaf", t, fixed = TRUE)]
# parse branch lines
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_rx_nonames <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
text_has_feature_names <- FALSE
if (NROW(model_feature_names)) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
} else {
# Note: when passing a text dump, it might or might not have feature names,
# but that aspect is unknown from just the text attributes
branch_rx <- branch_rx_nonames
if (from_text) {
if (sum(grepl(branch_rx_w_names, text)) > sum(grepl(branch_rx_nonames, text))) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
}
}
}
if (text_has_feature_names && is.null(model) && !is.null(feature_names)) {
stop("'text' contains feature names. Cannot override them.")
}
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
td[
isLeaf == FALSE,
@ -144,10 +173,12 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
is_stump <- function() {
return(length(td$Feature) == 1 && is.na(td$Feature))
}
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
if (!text_has_feature_names) {
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
}
}
# parse leaf lines

View File

@ -303,7 +303,11 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
if (is.character(features) && is.null(colnames(data)))
stop("either provide `data` with column names or provide `features` as column indices")
if (is.null(model$feature_names) && model$nfeatures != ncol(data))
model_feature_names <- NULL
if (is.null(features) && !is.null(model)) {
model_feature_names <- xgb.feature_names(model)
}
if (is.null(model_feature_names) && xgb.num_feature(model) != ncol(data))
stop("if model has no feature_names, columns in `data` must match features in model")
if (!is.null(subsample)) {
@ -332,7 +336,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
}
if (is.null(features)) {
if (!is.null(model$feature_names)) {
if (!is.null(model_feature_names)) {
imp <- xgb.importance(model = model, trees = trees)
} else {
imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))

View File

@ -1,12 +1,24 @@
#' Save xgboost model to binary file
#'
#' Save xgboost model to a file in binary format.
#' Save xgboost model to a file in binary or JSON format.
#'
#' @param model model object of \code{xgb.Booster} class.
#' @param fname name of the file to write.
#' @param model Model object of \code{xgb.Booster} class.
#' @param fname Name of the file to write.
#'
#' Note that the extension of this file name determines the serialization format to use:\itemize{
#' \item Extension ".ubj" will use the universal binary JSON format (recommended).
#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
#' of precision when converting to a human-readable JSON text or similar.
#' \item Extension ".json" will use plain JSON, which is a human-readable format.
#' \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
#' attribute that boosters might keep, nor feature names or user-specified attributes.
#' \item If the format is not specified by passing one of the file extensions above, it will
#' default to UBJ.
#' }
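#'
#' For example (a minimal sketch, assuming \code{bst} is a fitted \code{xgb.Booster}):
#' \preformatted{
#' xgb.save(bst, "model.ubj")   # universal binary JSON (recommended)
#' xgb.save(bst, "model.json")  # plain-text JSON
#' }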
#'
#' @details
#' This methods allows to save a model in an xgboost-internal binary format which is universal
#' This method allows saving a model in an xgboost-internal binary or text format which is universal
#' among the various xgboost interfaces. In R, the saved model file could be read-in later
#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
#' of \code{\link{xgb.train}}.
@ -14,13 +26,13 @@
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{saveRDS}}
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
#' \code{\link[base]{saveRDS}} or \code{\link[base]{save}} might cause compatibility problems in
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
#' how to persist models in a future-proof way, i.e. to make the model accessible in future
#' releases of XGBoost.
#'
#' @seealso
#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.load}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -51,8 +63,7 @@ xgb.save <- function(model, fname) {
stop("model must be xgb.Booster.",
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
}
model <- xgb.Booster.complete(model, saveraw = FALSE)
fname <- path.expand(fname)
.Call(XGBoosterSaveModel_R, model$handle, enc2utf8(fname[1]))
.Call(XGBoosterSaveModel_R, xgb.get.handle(model), enc2utf8(fname[1]))
return(TRUE)
}

View File

@ -11,8 +11,6 @@
#' \item \code{deprecated}: Encode the booster into old customized binary format.
#' }
#'
#' Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@ -30,7 +28,7 @@
#' bst <- xgb.load.raw(raw)
#'
#' @export
xgb.save.raw <- function(model, raw_format = "deprecated") {
xgb.save.raw <- function(model, raw_format = "ubj") {
handle <- xgb.get.handle(model)
args <- list(format = raw_format)
.Call(XGBoosterSaveModelToRaw_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE))

View File

@ -1,21 +0,0 @@
#' Serialize the booster instance into R's raw vector. The serialization method differs
#' from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
#' parameters. This serialization format is not stable across different xgboost versions.
#'
#' @param booster the booster instance
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' raw <- xgb.serialize(bst)
#' bst <- xgb.unserialize(raw)
#'
#' @export
xgb.serialize <- function(booster) {
handle <- xgb.get.handle(booster)
.Call(XGBoosterSerializeToBuffer_R, handle)
}

View File

@ -152,6 +152,10 @@
#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
#' parameters' values. User can provide either existing or their own callback methods in order
#' to customize the training process.
#'
#' Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
#' are kept as R attributes, and thus do not get saved when using non-R serializers like
#' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
#' @param ... other parameters to pass to \code{params}.
#' @param label vector of response values. Should not be provided when data is
#' a local data file name or an \code{xgb.DMatrix}.
@ -160,6 +164,9 @@
#' This parameter is only used when input is a dense matrix.
#' @param weight a vector indicating the weight for each row of the input.
#'
#' @return
#' An object of class \code{xgb.Booster}.
#'
#' @details
#' These are the training functions for \code{xgboost}.
#'
@ -201,28 +208,20 @@
#' \item \code{cb.save.model}: when \code{save_period > 0} is set.
#' }
#'
#' @return
#' An object of class \code{xgb.Booster} with the following elements:
#' \itemize{
#' \item \code{handle} a handle (pointer) to the xgboost model in memory.
#' \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
#' \item \code{niter} number of boosting iterations.
#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to evaluation
#' metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#' \item \code{callbacks} callback functions that were either automatically assigned or
#' explicitly passed.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_score} the best evaluation metric value during early stopping.
#' (only available with early stopping).
#' \item \code{feature_names} names of the training dataset features
#' (only when column names were defined in training data).
#' \item \code{nfeatures} number of features in training data.
#' }
#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
#' from typical R objects (it's an 'altrep' list class), and it makes a separation between
#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
#' and shared between interfaces through serialization functions like \link{xgb.save}; and
#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
#' not used in any way by functions like \link{predict.xgb.Booster}.
#'
#' Be aware that one such R attribute that is automatically added is `params` - this attribute
#' is assigned from the `params` argument to this function, and is only meant to serve as a
#' reference for what went into the booster, but is not used in other methods that take a booster
#' object - so for example, changing the booster's configuration requires calling `xgb.config<-`
#' or `xgb.parameters<-`, while simply modifying `attributes(model)$params$<...>` will have no
#' effect elsewhere.
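#'
#' For example (a sketch of the distinction, assuming a fitted booster \code{bst}):
#' \preformatted{
#' xgb.attr(bst, "my_attr") <- "a"  # booster attribute: kept by xgb.save
#' attr(bst, "my_r_attr") <- "b"    # R attribute: kept only by saveRDS
#' }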
#'
#' @seealso
#' \code{\link{callbacks}},
@ -371,27 +370,31 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# The tree updating process would need slightly different handling
is_update <- NVL(params[['process_type']], '.') == 'update'
past_evaluation_log <- NULL
if (inherits(xgb_model, "xgb.Booster")) {
past_evaluation_log <- attributes(xgb_model)$evaluation_log
}
# Construct a booster (either a new one or load from xgb_model)
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = params,
cachelist = append(watchlist, dtrain),
modelfile = xgb_model,
handle = NULL
modelfile = xgb_model
)
niter_init <- bst$niter
bst <- bst$bst
.Call(
XGBoosterCopyInfoFromDMatrix_R,
xgb.get.handle(bst),
dtrain
)
bst <- xgb.handleToBooster(handle = handle, raw = NULL)
# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# Note: it might look like these aren't used, but they need to be defined in this
# environment for the callbacks to work correctly.
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint
# When the 'xgb_model' was set, find out how many boosting iterations it has
niter_init <- 0
if (!is.null(xgb_model)) {
niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
if (length(niter_init) == 0) {
niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
}
}
if (is_update && nrounds > niter_init)
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")
@ -405,7 +408,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
for (f in cb$pre_iter) f()
xgb.iter.update(
booster_handle = bst$handle,
bst = bst,
dtrain = dtrain,
iter = iteration - 1,
obj = obj
@ -413,46 +416,43 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
if (length(watchlist) > 0) {
bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter
booster_handle = bst$handle,
bst = bst,
watchlist = watchlist,
iter = iteration - 1,
feval = feval
)
}
xgb.attr(bst$handle, 'niter') <- iteration - 1
for (f in cb$post_iter) f()
if (stop_condition) break
}
for (f in cb$finalize) f(finalize = TRUE)
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
# store the total number of boosting iterations
bst$niter <- end_iteration
# store the evaluation results
if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) {
keep_evaluation_log <- FALSE
if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) {
keep_evaluation_log <- TRUE
# include the previous compatible history when available
if (inherits(xgb_model, 'xgb.Booster') &&
!is_update &&
!is.null(xgb_model$evaluation_log) &&
!is.null(past_evaluation_log) &&
isTRUE(all.equal(colnames(evaluation_log),
colnames(xgb_model$evaluation_log)))) {
evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
colnames(past_evaluation_log)))) {
evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log))
}
bst$evaluation_log <- evaluation_log
}
bst$call <- match.call()
bst$params <- params
bst$callbacks <- callbacks
if (!is.null(colnames(dtrain)))
bst$feature_names <- colnames(dtrain)
bst$nfeatures <- ncol(dtrain)
extra_attrs <- list(
call = match.call(),
params = params,
callbacks = callbacks
)
if (keep_evaluation_log) {
extra_attrs$evaluation_log <- evaluation_log
}
curr_attrs <- attributes(bst)
attributes(bst) <- c(curr_attrs, extra_attrs)
return(bst)
}

View File

@ -1,41 +0,0 @@
#' Load the instance back from \code{\link{xgb.serialize}}
#'
#' @param buffer the buffer containing booster instance saved by \code{\link{xgb.serialize}}
#' @param handle An \code{xgb.Booster.handle} object which will be overwritten with
#' the new deserialized object. Must be a null handle (e.g. when loading the model through
#' `readRDS`). If not provided, a new handle will be created.
#' @return An \code{xgb.Booster.handle} object.
#'
#' @export
xgb.unserialize <- function(buffer, handle = NULL) {
cachelist <- list()
if (is.null(handle)) {
handle <- .Call(XGBoosterCreate_R, cachelist)
} else {
if (!is.null.handle(handle))
stop("'handle' is not null/empty. Cannot overwrite existing handle.")
.Call(XGBoosterCreateInEmptyObj_R, cachelist, handle)
}
tryCatch(
.Call(XGBoosterUnserializeFromBuffer_R, handle, buffer),
error = function(e) {
error_msg <- conditionMessage(e)
m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)",
error_msg, perl = TRUE)
groups <- regmatches(error_msg, m)[[1]]
if (length(groups) == 3) {
warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ",
"loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ",
"function, to ensure that your model can be read in current and upcoming ",
"XGBoost releases. Please use xgb.save() instead to preserve models for the ",
"long term. For more details and explanation, see ",
"https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html",
sep = ""))
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
} else {
stop(e)
}
})
class(handle) <- "xgb.Booster.handle"
return(handle)
}

View File

@ -100,8 +100,10 @@ NULL
#' @importFrom jsonlite toJSON
#' @importFrom methods new
#' @importFrom utils object.size str tail
#' @importFrom stats coef
#' @importFrom stats predict
#' @importFrom stats median
#' @importFrom stats variable.names
#' @importFrom utils head
#' @importFrom graphics barplot
#' @importFrom graphics lines

View File

@ -1,5 +1,4 @@
basic_walkthrough Basic feature walkthrough
caret_wrapper Use xgboost to train in caret library
custom_objective Customize loss function, and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees

View File

@ -1,7 +1,6 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Train a xgboost model from caret library](caret_wrapper.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)

View File

@ -1,44 +0,0 @@
# install development version of caret library that contains xgboost models
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)
# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package
# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent
# and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE)
# Let's add some new categorical features to see if it helps.
# Of course these feature are highly correlated to the Age feature.
# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features,
# even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old.
# I choose this value based on nothing.
# We will see later if simplifying the information based on arbitrary values is a good strategy
# (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]
#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# train a xgbTree model using caret::train
model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)
# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
# using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)
# See model results
print(model)

View File

@ -27,7 +27,7 @@ head(pred_with_leaf)
create.new.tree.features <- function(model, original.features) {
pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
cols <- list()
for (i in 1:model$niter) {
for (i in 1:xgb.get.num.boosted.rounds(model)) {
# max is not the real max, but it's not important for the purpose of adding features
leaf.id <- sort(unique(pred_with_leaf[, i]))
cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id)

View File

@ -9,6 +9,5 @@ demo(create_sparse_matrix, package = 'xgboost')
demo(predict_leaf_indices, package = 'xgboost')
demo(early_stopping, package = 'xgboost')
demo(poisson_regression, package = 'xgboost')
demo(caret_wrapper, package = 'xgboost')
demo(tweedie_regression, package = 'xgboost')
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support

View File

@ -2,16 +2,44 @@
% Please edit documentation in R/utils.R
\name{a-compatibility-note-for-saveRDS-save}
\alias{a-compatibility-note-for-saveRDS-save}
\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.}
\title{Model Serialization and Compatibility}
\description{
It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
the model is to be accessed in the future. If you train a model with the current version of
XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
accessible in later releases of XGBoost. To ensure that your model can be accessed in future
releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
When it comes to serializing XGBoost models, it's possible to use R serializers such as
\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
its own serializers with better compatibility guarantees, which allow loading
said models in other language bindings of XGBoost.
Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{
\item Additional model configuration (accessible through \link{xgb.config}),
which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}.
These are not necessarily useful for prediction/importance/plotting.
\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log}
if present.
}
The first one (configuration) does not have the same compatibility guarantees as
the model itself, and the same applies to attributes that are set and accessed through
\link{xgb.attributes} - that is, such configuration might be lost after loading the booster
in a different XGBoost version, regardless of the serializer that was used. These are saved
when using \link{saveRDS}, but will be discarded if loaded into an incompatible XGBoost
version. They are not saved when using XGBoost's serializers from its public interface,
including \link{xgb.save} and \link{xgb.save.raw}.
The second type (R attributes) is not part of the standard XGBoost model structure, and thus is
not saved when using XGBoost's own serializers. These attributes are only used for informational
purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
These R attributes are only preserved when using R's serializers.
Note that XGBoost models in R from version \verb{2.1.0} onwards have a very different R object
structure than models from earlier versions, and the two are incompatible with each other.
Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before
version \verb{2.1.0} will not work with later \code{xgboost} versions, and vice versa. Be aware that
the structure of R model objects could in theory change again in the future, so XGBoost's serializers
should be preferred for long-term storage.
Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
higher of said package, and will have the same compatibility restrictions as R serializers.
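
For example (a minimal sketch of the two routes, assuming a fitted booster \code{bst}):
\preformatted{
xgb.save(bst, "model.ubj")  # portable across XGBoost versions and interfaces
saveRDS(bst, "model.rds")   # R-only; also keeps R attributes such as evaluation logs
}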
}
\details{
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@ -24,9 +52,10 @@ re-construct the corresponding model. To read the model back, use \code{\link{xg
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
as part of another R object.
Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
model but also internal configurations and parameters, and its format is not stable across
multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
control, as they rely on R's serialization format (see e.g. the details section in
\link{serialize} and \link{save} from base R).
For more details and explanation about model persistence and archival, consult the page
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.

View File

@ -4,17 +4,22 @@
\alias{cb.save.model}
\title{Callback closure for saving a model file.}
\usage{
cb.save.model(save_period = 0, save_name = "xgboost.model")
cb.save.model(save_period = 0, save_name = "xgboost.ubj")
}
\arguments{
\item{save_period}{save the model to disk after every
\code{save_period} iterations; 0 means save the model at the end.}
\item{save_name}{the name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.model',
the file saved at iteration 50 would be named "xgboost_0050.model".}
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that the format of the model being saved is determined by the file
extension specified here (see \link{xgb.save} for details about how it works).
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.ubj',
the file saved at iteration 50 would be named "xgboost_0050.ubj".
}\if{html}{\out{</div>}}}
}
\description{
Callback closure for saving a model file.
@ -29,5 +34,7 @@ Callback function expects the following values to be set in its calling frame:
\code{end_iteration}.
}
\seealso{
\link{xgb.save}
\code{\link{callbacks}}
}

View File

@ -0,0 +1,50 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{coef.xgb.Booster}
\alias{coef.xgb.Booster}
\title{Extract coefficients from linear booster}
\usage{
\method{coef}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{A fitted booster of 'gblinear' type.}
\item{...}{Not used.}
}
\value{
The extracted coefficients:\itemize{
\item If there's only one coefficient per column in the data, will be returned as a
vector, potentially containing the feature names if available, with the intercept
as the first element.
\item If there's more than one coefficient per column in the data (e.g. when using
\code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal
to \verb{[num_features, num_cols]}, with the intercepts as the first row. Note that the column
(classes in multi-class classification) dimension will not be named.
}
The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
hence one should get the same values from calling \code{predict(..., outputmargin = TRUE)} and
from performing a matrix multiplication with \code{model.matrix(~., ...)}.
Be aware that the coefficients are obtained by first converting them to strings and
back, so there will always be some very small loss of precision compared to the actual
coefficients as used by \link{predict.xgb.Booster}.
}
\description{
Extracts the coefficients from a 'gblinear' booster object,
as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}.
Note: this function will error out if passing a booster model
which is not of "gblinear" type.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars[, 1]
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
params <- list(booster = "gblinear", nthread = 1)
model <- xgb.train(data = dm, params = params, nrounds = 2)
coef(model)
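
# As noted above, the coefficients should reproduce the margin predictions
# (a sketch; expect only a very small numerical difference):
margin <- predict(model, x, outputmargin = TRUE)
manual <- model.matrix(~., data = mtcars[, -1]) %*% coef(model)
max(abs(margin - manual))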
}

View File

@ -1,24 +1,42 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{getinfo}
% Please edit documentation in R/xgb.Booster.R, R/xgb.DMatrix.R
\name{getinfo.xgb.Booster}
\alias{getinfo.xgb.Booster}
\alias{setinfo.xgb.Booster}
\alias{getinfo}
\alias{getinfo.xgb.DMatrix}
\title{Get information of an xgb.DMatrix object}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Get or set information of xgb.DMatrix and xgb.Booster objects}
\usage{
\method{getinfo}{xgb.Booster}(object, name)
\method{setinfo}{xgb.Booster}(object, name, info)
getinfo(object, name)
\method{getinfo}{xgb.DMatrix}(object, name)
setinfo(object, name, info)
\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix}}
\item{object}{Object of class \code{xgb.DMatrix} or \code{xgb.Booster}.}
\item{name}{the name of the information field to get (see details)}
\item{info}{the specific field of information to set}
}
\value{
For \code{getinfo}, will return the requested field. For \code{setinfo}, will always return value \code{TRUE}
if it succeeds.
}
\description{
Get information of an xgb.DMatrix object
Get or set information of xgb.DMatrix and xgb.Booster objects
}
\details{
The \code{name} field can be one of the following:
The \code{name} field can be one of the following for \code{xgb.DMatrix}:
\itemize{
\item \code{label}
@ -33,8 +51,28 @@ The \code{name} field can be one of the following:
}
See the documentation for \link{xgb.DMatrix} for more information about these fields.
For \code{xgb.Booster}, it can be one of the following:
\itemize{
\item \code{feature_type}
\item \code{feature_name}
}
Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
for a DMatrix that had 'qid' assigned.
\bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See
\link{xgb.copy.Booster} for an idea of how this in-place assignment works.
See the documentation for \link{xgb.DMatrix} for possible fields that can be set
(which correspond to arguments in that function).
Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
\item data
\item missing
\item silent
\item nthread
}
}
\examples{
data(agaricus.train, package='xgboost')
@ -45,4 +83,11 @@ setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
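
# Boosters can also get and set their feature names ('feature_name')
# and types ('feature_type'):
bst <- xgboost(data = dtrain, nrounds = 1, nthread = 2,
               objective = "binary:logistic")
head(getinfo(bst, 'feature_name'))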
}

View File

@ -2,7 +2,6 @@
% Please edit documentation in R/xgb.Booster.R
\name{predict.xgb.Booster}
\alias{predict.xgb.Booster}
\alias{predict.xgb.Booster.handle}
\title{Predict method for XGBoost model}
\usage{
\method{predict}{xgb.Booster}(
@ -21,11 +20,9 @@
strict_shape = FALSE,
...
)
\method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}.}
\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.

View File

@ -4,14 +4,15 @@
\alias{print.xgb.Booster}
\title{Print xgb.Booster}
\usage{
\method{print}{xgb.Booster}(x, verbose = FALSE, ...)
\method{print}{xgb.Booster}(x, ...)
}
\arguments{
\item{x}{An \code{xgb.Booster} object.}
\item{verbose}{Whether to print detailed data (e.g., attribute values).}
\item{...}{Not currently used.}
\item{...}{Not used.}
}
\value{
The same \code{x} object, returned invisibly
}
\description{
Print information about \code{xgb.Booster}.
@ -33,6 +34,5 @@ bst <- xgboost(
attr(bst, "myattr") <- "memo"
print(bst)
print(bst, verbose = TRUE)
}

View File

@ -1,42 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{setinfo}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, name, info)
\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
}
\description{
Set information of an xgb.DMatrix object
}
\details{
See the documentation for \link{xgb.DMatrix} for possible fields that can be set
(which correspond to arguments in that function).
Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
\item data
\item missing
\item silent
\item nthread
}
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
}

View File

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{variable.names.xgb.Booster}
\alias{variable.names.xgb.Booster}
\title{Get Features Names from Booster}
\usage{
\method{variable.names}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{An \code{xgb.Booster} object.}
\item{...}{Not used.}
}
\description{
Returns the feature / variable / column names from a fitted
booster object, which are set automatically during the call to \link{xgb.train}
from the DMatrix names, or which can be set manually through \link{setinfo}.
If the object doesn't have feature names, will return \code{NULL}.
It is equivalent to calling \code{getinfo(object, "feature_name")}.
}
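\examples{
data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data,
  label = agaricus.train$label,
  nrounds = 2,
  nthread = 2,
  objective = "binary:logistic"
)
# the names are picked up from the column names of the training data
head(variable.names(bst))
}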

View File

@ -1,61 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.Booster.complete}
\alias{xgb.Booster.complete}
\title{Restore missing parts of an incomplete xgb.Booster object}
\usage{
xgb.Booster.complete(object, saveraw = TRUE)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}
\item{saveraw}{A flag indicating whether to append \code{raw} Booster memory dump data
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
but it has a raw Booster memory dump).
}
\details{
While this method is primarily for internal use, it might be useful in some practical situations.
E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using
\code{xgb.Booster.complete()} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete()} function explicitly once after loading a model as an R-object.
That would prevent further repeated implicit reconstruction of an internal booster model.
}
\examples{
data(agaricus.train, package = "xgboost")
bst <- xgboost(
data = agaricus.train$data,
label = agaricus.train$label,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
)
fname <- file.path(tempdir(), "xgb_model.Rds")
saveRDS(bst, fname)
# Warning: The resulting RDS file is only compatible with the current XGBoost version.
# Refer to the section titled "a-compatibility-note-for-saveRDS-save".
bst1 <- readRDS(fname)
# the handle is invalid:
print(bst1$handle)
bst1 <- xgb.Booster.complete(bst1)
# now the handle points to a valid internal booster model:
print(bst1$handle)
}

View File

@ -16,7 +16,7 @@ xgb.attributes(object)
xgb.attributes(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}
\item{name}{A non-empty character string specifying which attribute is to be accessed.}
@ -51,15 +51,14 @@ Also, setting an attribute that has the same name as one of xgboost's parameters
change the value of that parameter for a model.
Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters.
The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied.
That would only matter if attributes need to be set many times.
Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
and it would be the user's responsibility to call \code{\link[=xgb.serialize]{xgb.serialize()}} to update it.
The \verb{xgb.attributes<-} setter either updates existing attributes or adds one or several
new ones, but it doesn't delete the other existing attributes.
Important: since this modifies the booster's C object, semantics for assignment here
will differ from R's, as any object reference to the same booster will be modified
too, while assignment of R attributes through \verb{attributes(model)$<attr> <- <value>}
will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
example of these behaviors).
}
\examples{
data(agaricus.train, package = "xgboost")

View File

@ -10,13 +10,23 @@ xgb.config(object)
xgb.config(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}
\item{value}{A JSON string.}
\item{value}{An R list.}
}
\value{
\code{xgb.config} will return the parameters as an R list.
}
\description{
Accessors for model parameters as JSON string
}
\details{
Note that assignment is performed in-place on the booster C object, which unlike assignment
of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
to the same booster will also get updated.
See \link{xgb.copy.Booster} for an example of this behavior.
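
For example (a sketch; the available fields depend on the XGBoost version):
\preformatted{
config <- xgb.config(model)  # full configuration as an R list
xgb.config(model) <- config  # writes it back, modifying 'model' in-place
}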
}
\examples{
data(agaricus.train, package = "xgboost")

View File

@ -0,0 +1,53 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.copy.Booster}
\alias{xgb.copy.Booster}
\title{Deep-copies a Booster Object}
\usage{
xgb.copy.Booster(model)
}
\arguments{
\item{model}{An 'xgb.Booster' object.}
}
\value{
A deep copy of \code{model} - it will be identical in every way, but C-level
functions called on that copy will not affect the \code{model} variable.
}
\description{
Creates a deep copy of an 'xgb.Booster' object, such that the
C object pointer contained will be a different object, and hence functions
like \link{xgb.attr} will not affect the object from which it was copied.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- mtcars[, -1]
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(nthread = 1),
nround = 3
)
# Set an arbitrary attribute kept at the C level
xgb.attr(model, "my_attr") <- 100
print(xgb.attr(model, "my_attr"))
# Just assigning to a new variable will not create
# a deep copy - C object pointer is shared, and in-place
# modifications will affect both objects
model_shallow_copy <- model
xgb.attr(model_shallow_copy, "my_attr") <- 333
# 'model' was also affected by this change:
print(xgb.attr(model, "my_attr"))
model_deep_copy <- xgb.copy.Booster(model)
xgb.attr(model_deep_copy, "my_attr") <- 444
# 'model' was NOT affected by this change
# (keeps previous value that was assigned before)
print(xgb.attr(model, "my_attr"))
# Verify that the new object was actually modified
print(xgb.attr(model_deep_copy, "my_attr"))
}

View File

@ -8,7 +8,8 @@ xgb.gblinear.history(model, class_index = NULL)
}
\arguments{
\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
using the \code{cb.gblinear.history()} callback.}
using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
loaded from \link{xgb.load} or \link{xgb.load.raw}.}
\item{class_index}{zero-based class index to extract the coefficients for only that
specific class in a multinomial multiclass model. When it is NULL, all the
@ -27,3 +28,11 @@ A helper function to extract the matrix of linear coefficients' history
from a gblinear model created while using the \code{cb.gblinear.history()}
callback.
}
\details{
Note that this is an R-specific function that relies on R attributes that
are not preserved by XGBoost's own serialization functions, and which will therefore
be missing from boosters loaded through \link{xgb.load} or \link{xgb.load.raw}.
In order for a serialized model to be accepted by this function, one must use R
serializers such as \link{saveRDS}.
}

View File

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.get.num.boosted.rounds}
\alias{xgb.get.num.boosted.rounds}
\title{Get number of boosting rounds in a fitted booster}
\usage{
xgb.get.num.boosted.rounds(model)
}
\arguments{
\item{model}{A fitted \code{xgb.Booster} model.}
}
\value{
The number of rounds saved in the model, as an integer.
}
\description{
Get number of boosting rounds in a fitted booster
}
\details{
Note that setting booster parameters related to training
continuation / updates through \link{xgb.parameters<-} will reset the
number of rounds to zero.
}
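\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
  data = xgb.DMatrix(x, label = y, nthread = 1),
  params = list(nthread = 1),
  nrounds = 3
)
# a minimal illustration: the result matches the 'nrounds' used above
xgb.get.num.boosted.rounds(model)
}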

View File

@ -0,0 +1,59 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.is.same.Booster}
\alias{xgb.is.same.Booster}
\title{Check if two boosters share the same C object}
\usage{
xgb.is.same.Booster(obj1, obj2)
}
\arguments{
\item{obj1}{Booster model to compare with \code{obj2}.}
\item{obj2}{Booster model to compare with \code{obj1}.}
}
\value{
Either \code{TRUE} or \code{FALSE} according to whether the two boosters share
the underlying C object.
}
\description{
Checks whether two booster objects refer to the same underlying C object.
}
\details{
As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
object, they don't follow typical copy-on-write semantics of other R objects - that is, if
one assigns a booster to a different variable and modifies that new variable through in-place
methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
variable, unlike typical R assignments which would only modify the latter.
This function allows checking whether two booster objects share the same 'externalptr',
regardless of the R attributes that they might have.
In order to duplicate a booster in such a way that the copy wouldn't share the same
'externalptr', one can use function \link{xgb.copy.Booster}.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
params = list(nthread = 1),
data = xgb.DMatrix(x, label = y, nthread = 1),
nround = 3
)
model_shallow_copy <- model
xgb.is.same.Booster(model, model_shallow_copy) # same C object
model_deep_copy <- xgb.copy.Booster(model)
xgb.is.same.Booster(model, model_deep_copy) # different C objects
# In-place assignments modify all references,
# but not full/deep copies of the booster
xgb.attr(model_shallow_copy, "my_attr") <- 111
xgb.attr(model, "my_attr") # gets modified
xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
}
\seealso{
\link{xgb.copy.Booster}
}

View File

@ -48,5 +48,5 @@ xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.save}}
}

View File

@ -4,12 +4,10 @@
\alias{xgb.load.raw}
\title{Load serialised xgboost model from R's raw vector}
\usage{
xgb.load.raw(buffer, as_booster = FALSE)
xgb.load.raw(buffer)
}
\arguments{
\item{buffer}{the buffer returned by xgb.save.raw}
\item{as_booster}{Return the loaded model as xgb.Booster instead of xgb.Booster.handle.}
}
\description{
User can generate raw memory buffer by calling xgb.save.raw
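
For example (a minimal sketch, assuming a fitted booster \code{bst}):
\preformatted{
raw <- xgb.save.raw(bst)   # serialize the model to R's raw vector
bst2 <- xgb.load.raw(raw)  # reconstruct a booster from it
}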

View File

@ -14,8 +14,11 @@ xgb.model.dt.tree(
)
}
\arguments{
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}
\item{feature_names}{Character vector of feature names. If the model already
contains feature names, those will be used when \code{feature_names=NULL} (default value).
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that, if the model already contains feature names, it's \\bold\{not\} possible to override them here.
}\if{html}{\out{</div>}}}
\item{model}{Object of class \code{xgb.Booster}.}
@ -76,8 +79,6 @@ bst <- xgboost(
objective = "binary:logistic"
)
(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
# This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst))

View File

@ -7,17 +7,27 @@
xgb.parameters(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place}.}
\item{value}{A list (or an object coercible to a list) with the names of parameters to set
and the elements corresponding to parameter values.}
}
\value{
The same booster \code{object}, which gets modified in-place.
}
\description{
Only the setter for xgboost parameters is currently implemented.
}
\details{
Note that the setter would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle would need to be copied.
Just like \link{xgb.attr}, this function will make in-place modifications
on the booster object which do not follow typical R assignment semantics - that is,
all references to the same booster will also be updated, unlike assignment of R
attributes which follow copy-on-write semantics.
See \link{xgb.copy.Booster} for an example of this behavior.
Be aware that setting parameters of a fitted booster related to training continuation / updates
will reset its number of rounds indicator to zero.
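
For example (a minimal sketch):
\preformatted{
xgb.parameters(model) <- list(max_depth = 3)  # updates 'model' in-place
}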
}
\examples{
data(agaricus.train, package = "xgboost")

View File

@ -7,15 +7,27 @@
xgb.save(model, fname)
}
\arguments{
\item{model}{model object of \code{xgb.Booster} class.}
\item{model}{Model object of \code{xgb.Booster} class.}
\item{fname}{name of the file to write.}
\item{fname}{Name of the file to write.
Note that the extension of this file name determines the serialization format to use:\itemize{
\item Extension ".ubj" will use the universal binary JSON format (recommended).
This format uses binary types for e.g. floating point numbers, thereby preventing any loss
of precision when converting to a human-readable JSON text or similar.
\item Extension ".json" will use plain JSON, which is a human-readable format.
\item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
attribute that boosters might keep, nor feature names or user-specified attributes.
\item If the format is not specified by passing one of the file extensions above, it will
default to UBJ.
}}
}
\description{
Save xgboost model to a file in binary format.
Save xgboost model to a file in binary or JSON format.
}
\details{
This methods allows to save a model in an xgboost-internal binary format which is universal
This method allows saving a model in an xgboost-internal binary or text format which is universal
among the various xgboost interfaces. In R, the saved model file could be read-in later
using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
of \code{\link{xgb.train}}.
@ -23,7 +35,7 @@ of \code{\link{xgb.train}}.
Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{saveRDS}}
or \code{\link[base]{save}}). However, it would then only be compatible with R, and
corresponding R-methods would need to be used to load it. Moreover, persisting the model with
\code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
\code{\link[base]{saveRDS}} or \code{\link[base]{save}} might cause compatibility problems in
future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
how to persist models in a future-proof way, i.e. to make the model accessible in future
releases of XGBoost.
@ -51,5 +63,5 @@ xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.load}}
}

View File

@ -5,7 +5,7 @@
\title{Save xgboost model to R's raw vector,
user can call xgb.load.raw to load the model back from raw vector}
\usage{
xgb.save.raw(model, raw_format = "deprecated")
xgb.save.raw(model, raw_format = "ubj")
}
\arguments{
\item{model}{the model object.}
@ -15,9 +15,7 @@ xgb.save.raw(model, raw_format = "deprecated")
\item \code{json}: Encode the booster into JSON text document.
\item \code{ubj}: Encode the booster into Universal Binary JSON.
\item \code{deprecated}: Encode the booster into old customized binary format.
}
Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.}
}}
}
\description{
Save xgboost model from xgboost or xgb.train

View File

@ -1,29 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.serialize.R
\name{xgb.serialize}
\alias{xgb.serialize}
\title{Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.}
\usage{
xgb.serialize(booster)
}
\arguments{
\item{booster}{the booster instance}
}
\description{
Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
raw <- xgb.serialize(bst)
bst <- xgb.unserialize(raw)
}

View File

@ -205,7 +205,12 @@ file with a previously saved model.}
\item{callbacks}{a list of callback functions to perform various task during boosting.
See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
parameters' values. User can provide either existing or their own callback methods in order
to customize the training process.}
to customize the training process.
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
are kept as R attributes, and thus do not get saved when using non-R serializers like
\link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
}\if{html}{\out{</div>}}}
\item{...}{other parameters to pass to \code{params}.}
@ -219,27 +224,7 @@ This parameter is only used when input is a dense matrix.}
\item{weight}{a vector indicating the weight for each row of the input.}
}
\value{
An object of class \code{xgb.Booster} with the following elements:
\itemize{
\item \code{handle} a handle (pointer) to the xgboost model in memory.
\item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
\item \code{niter} number of boosting iterations.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to evaluation
metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_score} the best evaluation metric value during early stopping.
(only available with early stopping).
\item \code{feature_names} names of the training dataset features
(only when column names were defined in training data).
\item \code{nfeatures} number of features in training data.
}
An object of class \code{xgb.Booster}.
}
\description{
\code{xgb.train} is an advanced interface for training an xgboost model.
@ -285,6 +270,21 @@ and the \code{print_every_n} parameter is passed to it.
\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
\item \code{cb.save.model}: when \code{save_period > 0} is set.
}
Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently
from typical R objects (it's an 'altrep' list class), and it makes a separation between
internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
and shared between interfaces through serialization functions like \link{xgb.save}; and
R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
not used in any way by functions like \link{predict.xgb.Booster}.
Be aware that one such R attribute that is automatically added is \code{params} - this attribute
is assigned from the \code{params} argument to this function, and is only meant to serve as a
reference for what went into the booster, but is not used in other methods that take a booster
object - so for example, changing the booster's configuration requires calling \verb{xgb.config<-}
or \verb{xgb.parameters<-}, while simply modifying \verb{attributes(model)$params$<...>} will have no
effect elsewhere.
}
\examples{
data(agaricus.train, package='xgboost')

View File

@ -1,21 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.unserialize.R
\name{xgb.unserialize}
\alias{xgb.unserialize}
\title{Load the instance back from \code{\link{xgb.serialize}}}
\usage{
xgb.unserialize(buffer, handle = NULL)
}
\arguments{
\item{buffer}{the buffer containing booster instance saved by \code{\link{xgb.serialize}}}
\item{handle}{An \code{xgb.Booster.handle} object which will be overwritten with
the new deserialized object. Must be a null handle (e.g. when loading the model through
\code{readRDS}). If not provided, a new handle will be created.}
}
\value{
An \code{xgb.Booster.handle} object.
}
\description{
Load the instance back from \code{\link{xgb.serialize}}
}

View File

@ -15,9 +15,16 @@ Check these declarations against the C/Fortran source code.
*/
/* .Call calls */
extern void XGBInitializeAltrepClass_R(DllInfo *info);
extern SEXP XGDuplicate_R(SEXP);
extern SEXP XGPointerEqComparison_R(SEXP, SEXP);
extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterCreate_R(SEXP);
extern SEXP XGBoosterCreateInEmptyObj_R(SEXP, SEXP);
extern SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP, SEXP);
extern SEXP XGBoosterSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGBoosterBoostedRounds_R(SEXP);
extern SEXP XGBoosterGetNumFeature_R(SEXP);
extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterEvalOneIter_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetAttrNames_R(SEXP);
@ -57,9 +64,15 @@ extern SEXP XGBGetGlobalConfig_R(void);
extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
static const R_CallMethodDef CallEntries[] = {
{"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1},
{"XGPointerEqComparison_R", (DL_FUNC) &XGPointerEqComparison_R, 2},
{"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5},
{"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1},
{"XGBoosterCreateInEmptyObj_R", (DL_FUNC) &XGBoosterCreateInEmptyObj_R, 2},
{"XGBoosterCopyInfoFromDMatrix_R", (DL_FUNC) &XGBoosterCopyInfoFromDMatrix_R, 2},
{"XGBoosterSetStrFeatureInfo_R",(DL_FUNC) &XGBoosterSetStrFeatureInfo_R,3}, // NOLINT
{"XGBoosterGetStrFeatureInfo_R",(DL_FUNC) &XGBoosterGetStrFeatureInfo_R,2}, // NOLINT
{"XGBoosterBoostedRounds_R", (DL_FUNC) &XGBoosterBoostedRounds_R, 1},
{"XGBoosterGetNumFeature_R", (DL_FUNC) &XGBoosterGetNumFeature_R, 1},
{"XGBoosterDumpModel_R", (DL_FUNC) &XGBoosterDumpModel_R, 4},
{"XGBoosterEvalOneIter_R", (DL_FUNC) &XGBoosterEvalOneIter_R, 4},
{"XGBoosterGetAttrNames_R", (DL_FUNC) &XGBoosterGetAttrNames_R, 1},
@ -106,4 +119,5 @@ __declspec(dllexport)
void attribute_visible R_init_xgboost(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
XGBInitializeAltrepClass_R(dll);
}

View File

@ -260,16 +260,18 @@ char cpp_ex_msg[512];
using dmlc::BeginPtr;
XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
}
XGB_DLL void _DMatrixFinalizer(SEXP ext) {
namespace {
void _DMatrixFinalizer(SEXP ext) {
R_API_BEGIN();
if (R_ExternalPtrAddr(ext) == NULL) return;
CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext)));
R_ClearExternalPtr(ext);
R_API_END();
}
} /* namespace */
XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str) {
R_API_BEGIN();
@ -527,8 +529,14 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) {
}
SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len));
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
if (TYPEOF(array) == STRSXP) {
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, STRING_ELT(array, i));
}
} else {
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
}
}
SEXP field_ = PROTECT(Rf_asChar(field));
@ -614,6 +622,14 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
return ScalarInteger(static_cast<int>(ncol));
}
XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
return Rf_duplicate(obj);
}
XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2) {
return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2));
}
XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle) {
const char *out_names[] = {"indptr", "data", ""};
SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
@ -682,14 +698,134 @@ XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle) {
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(ext)));
R_ClearExternalPtr(ext);
namespace {
void _BoosterFinalizer(SEXP R_ptr) {
if (R_ExternalPtrAddr(R_ptr) == NULL) return;
CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(R_ptr)));
R_ClearExternalPtr(R_ptr);
}
/* The Booster is represented as an altrep list with a single element, which
   corresponds to an 'externalptr' holding the C object; modification is
   forbidden by not implementing setters, and custom serialization is added. */
R_altrep_class_t XGBAltrepPointerClass;
R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) {
return 1;
}
SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) {
return R_altrep_data1(R_altrepped_obj);
}
SEXP XGBMakeEmptyAltrep() {
SEXP class_name = Rf_protect(Rf_mkString("xgb.Booster"));
SEXP elt_names = Rf_protect(Rf_mkString("ptr"));
SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP R_altrepped_obj = Rf_protect(R_new_altrep(XGBAltrepPointerClass, R_ptr, R_NilValue));
Rf_setAttrib(R_altrepped_obj, R_NamesSymbol, elt_names);
Rf_setAttrib(R_altrepped_obj, R_ClassSymbol, class_name);
Rf_unprotect(4);
return R_altrepped_obj;
}
/* Note: this function is kept separate from the one above so that all R
   allocations can be triggered before any non-R allocations happen. */
void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) {
SEXP R_ptr = R_altrep_data1(R_altrepped_obj);
R_SetExternalPtrAddr(R_ptr, handle);
R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE);
}
SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
R_API_BEGIN();
BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj));
char const *serialized_bytes;
bst_ulong serialized_length;
CHECK_CALL(XGBoosterSerializeToBuffer(
handle, &serialized_length, &serialized_bytes));
SEXP R_state = Rf_protect(Rf_allocVector(RAWSXP, serialized_length));
if (serialized_length != 0) {
std::memcpy(RAW(R_state), serialized_bytes, serialized_length);
}
Rf_unprotect(1);
return R_state;
R_API_END();
return R_NilValue; /* <- should not be reached */
}
SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
SEXP R_altrepped_obj = Rf_protect(XGBMakeEmptyAltrep());
R_API_BEGIN();
BoosterHandle handle = nullptr;
CHECK_CALL(XGBoosterCreate(nullptr, 0, &handle));
int res_code = XGBoosterUnserializeFromBuffer(handle,
RAW(R_state),
Rf_xlength(R_state));
if (res_code != 0) {
XGBoosterFree(handle);
}
CHECK_CALL(res_code);
XGBAltrepSetPointer(R_altrepped_obj, handle);
R_API_END();
Rf_unprotect(1);
return R_altrepped_obj;
}
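/* Illustrative R-side round trip that exercises the serializer and
   deserializer above (a sketch; 'bst' is assumed to be an existing booster):
     bst2 <- unserialize(serialize(bst, NULL))
     # or equivalently: saveRDS(bst, fname); bst2 <- readRDS(fname) */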
// https://purrple.cat/blog/2018/10/14/altrep-and-cpp/
Rboolean XGBAltrepInspector_R(
SEXP x, int pre, int deep, int pvec,
void (*inspect_subtree)(SEXP, int, int, int)) {
Rprintf("Altrepped external pointer [address:%p]\n",
R_ExternalPtrAddr(R_altrep_data1(x)));
return TRUE;
}
SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
R_API_BEGIN();
if (!deep) {
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
R_set_altrep_data1(out, R_altrep_data1(R_altrepped_obj));
Rf_unprotect(1);
return out;
} else {
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
char const *serialized_bytes;
bst_ulong serialized_length;
CHECK_CALL(XGBoosterSerializeToBuffer(
R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)),
&serialized_length, &serialized_bytes));
BoosterHandle new_handle = nullptr;
CHECK_CALL(XGBoosterCreate(nullptr, 0, &new_handle));
int res_code = XGBoosterUnserializeFromBuffer(new_handle,
serialized_bytes,
serialized_length);
if (res_code != 0) {
XGBoosterFree(new_handle);
}
CHECK_CALL(res_code);
XGBAltrepSetPointer(out, new_handle);
Rf_unprotect(1);
return out;
}
R_API_END();
return R_NilValue; /* <- should not be reached */
}
} /* namespace */
XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll) {
XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll);
R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R);
R_set_altlist_Elt_method(XGBAltrepPointerClass, XGBAltrepPointerGetElt_R);
R_set_altrep_Inspect_method(XGBAltrepPointerClass, XGBAltrepInspector_R);
R_set_altrep_Serialized_state_method(XGBAltrepPointerClass, XGBAltrepSerializer_R);
R_set_altrep_Unserialize_method(XGBAltrepPointerClass, XGBAltrepDeserializer_R);
R_set_altrep_Duplicate_method(XGBAltrepPointerClass, XGBAltrepDuplicate_R);
}
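/* What the class looks like from the R side (a sketch; 'bst' is assumed to be
   a trained booster):
     class(bst)   # "xgb.Booster"
     length(bst)  # 1, per XGBAltrepPointerLength_R above
     names(bst)   # "ptr"
     bst$ptr      # the 'externalptr' holding the C BoosterHandle */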
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
R_API_BEGIN();
R_xlen_t len = Rf_xlength(dmats);
BoosterHandle handle;
@ -703,33 +839,104 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
XGBAltrepSetPointer(out, handle);
R_API_END();
UNPROTECT(1);
return ret;
Rf_unprotect(1);
return out;
}
XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat) {
R_API_BEGIN();
R_xlen_t len = Rf_xlength(dmats);
BoosterHandle handle;
char const **feature_names;
bst_ulong len_feature_names = 0;
CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
"feature_name",
&len_feature_names,
&feature_names));
if (len_feature_names) {
CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
"feature_name",
feature_names,
len_feature_names));
}
char const **feature_types;
bst_ulong len_feature_types = 0;
CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
"feature_type",
&len_feature_types,
&feature_types));
if (len_feature_types) {
CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
"feature_type",
feature_types,
len_feature_types));
}
R_API_END();
return R_NilValue;
}
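/* R-side effect (a sketch): feature names/types attached to a DMatrix are
   copied onto the booster during training, e.g.
     dm <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"))
     bst <- xgb.train(params, dm, nrounds = 1)
     getinfo(bst, "feature_name")   # c("a", "b", "c") */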
XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features) {
R_API_BEGIN();
SEXP field_char = Rf_protect(Rf_asChar(field));
bst_ulong len_features = Rf_xlength(features);
int res_code;
{
std::vector<void*> dvec(len);
for (R_xlen_t i = 0; i < len; ++i) {
dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
std::vector<const char*> str_arr(len_features);
for (bst_ulong idx = 0; idx < len_features; idx++) {
str_arr[idx] = CHAR(STRING_ELT(features, idx));
}
res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
res_code = XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(handle),
CHAR(field_char),
str_arr.data(),
len_features);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(R_handle, handle);
R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE);
Rf_unprotect(1);
R_API_END();
return R_NilValue;
}
XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field) {
R_API_BEGIN();
bst_ulong len;
const char **out_features;
SEXP field_char = Rf_protect(Rf_asChar(field));
CHECK_CALL(XGBoosterGetStrFeatureInfo(R_ExternalPtrAddr(handle),
CHAR(field_char), &len, &out_features));
SEXP out = Rf_protect(Rf_allocVector(STRSXP, len));
for (bst_ulong idx = 0; idx < len; idx++) {
SET_STRING_ELT(out, idx, Rf_mkChar(out_features[idx]));
}
Rf_unprotect(2);
return out;
R_API_END();
return R_NilValue; /* <- should not be reached */
}
XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle) {
SEXP out = Rf_protect(Rf_allocVector(INTSXP, 1));
R_API_BEGIN();
CHECK_CALL(XGBoosterBoostedRounds(R_ExternalPtrAddr(handle), INTEGER(out)));
R_API_END();
Rf_unprotect(1);
return out;
}
/* Note: R's integer type is 32-bit and signed only, while xgboost supports
   larger counts, so the value is returned as floating point instead */
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle) {
SEXP out = Rf_protect(Rf_allocVector(REALSXP, 1));
R_API_BEGIN();
bst_ulong res;
CHECK_CALL(XGBoosterGetNumFeature(R_ExternalPtrAddr(handle), &res));
REAL(out)[0] = static_cast<double>(res);
R_API_END();
Rf_unprotect(1);
return out;
}
XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
R_API_BEGIN();
SEXP name_ = PROTECT(Rf_asChar(name));
@ -745,8 +952,8 @@ XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
R_API_BEGIN();
CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain)));
Rf_asInteger(iter),
R_ExternalPtrAddr(dtrain)));
R_API_END();
return R_NilValue;
}

View File

@ -8,7 +8,9 @@
#define XGBOOST_R_H_ // NOLINT(*)
#include <R.h>
#include <Rinternals.h>
#include <R_ext/Altrep.h>
#include <R_ext/Random.h>
#include <Rmath.h>
@ -143,6 +145,25 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
*/
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
/*!
* \brief Call R C-level function 'duplicate'
* \param obj Object to duplicate
*/
XGB_DLL SEXP XGDuplicate_R(SEXP obj);
/*!
* \brief Equality comparison for two pointers
* \param obj1 R 'externalptr'
* \param obj2 R 'externalptr'
*/
XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2);
/*!
* \brief Register the Altrep class used for the booster
* \param dll DLL info as provided by R_init
*/
XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll);
/*!
* \brief return the quantile cuts used for the histogram method
* \param handle an instance of data matrix
@ -174,13 +195,37 @@ XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle);
*/
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief copy information about features from a DMatrix into a Booster
* \param booster R 'externalptr' pointing to a booster object
* \param dmat R 'externalptr' pointing to a DMatrix object
*/
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat);
/*!
* \brief create xgboost learner, saving the pointer into an existing R object
* \param dmats a list of dmatrix handles that will be cached
* \param R_handle a clean R external pointer (not holding any object)
 * \brief set string-valued feature info (e.g. feature names) in the booster
 * \param handle R 'externalptr' holding the booster object
 * \param field field name
 * \param features features to set for the field
*/
XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle);
XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features);
/*!
 * \brief get string-valued feature info (e.g. feature names) from the booster
 * \param handle R 'externalptr' holding the booster object
 * \param field field name
*/
XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field);
/*!
* \brief Get the number of boosted rounds from a model
* \param handle R 'externalptr' holding the booster object
*/
XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle);
/*!
* \brief Get the number of features to which the model was fitted
* \param handle R 'externalptr' holding the booster object
*/
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle);
/*!
* \brief set parameters

View File

@ -3,7 +3,6 @@
## inconsistent is found.
pkgs <- c(
## CI
"caret",
"pkgbuild",
"roxygen2",
"XML",

View File

@ -25,10 +25,10 @@ test_that("train and predict binary classification", {
"train-error"
)
expect_equal(class(bst), "xgb.Booster")
expect_equal(bst$niter, nrounds)
expect_false(is.null(bst$evaluation_log))
expect_equal(nrow(bst$evaluation_log), nrounds)
expect_lt(bst$evaluation_log[, min(train_error)], 0.03)
expect_equal(xgb.get.num.boosted.rounds(bst), nrounds)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_equal(nrow(attributes(bst)$evaluation_log), nrounds)
expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.03)
pred <- predict(bst, test$data)
expect_length(pred, 1611)
@ -36,7 +36,7 @@ test_that("train and predict binary classification", {
pred1 <- predict(bst, train$data, ntreelimit = 1)
expect_length(pred1, 6513)
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
err_log <- bst$evaluation_log[1, train_error]
err_log <- attributes(bst)$evaluation_log[1, train_error]
expect_lt(abs(err_pred1 - err_log), 10e-6)
pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
@ -160,9 +160,9 @@ test_that("train and predict softprob", {
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris) * 3)
# row sums add up to total probability of 1:
@ -172,12 +172,12 @@ test_that("train and predict softprob", {
expect_equal(as.numeric(t(mpred)), pred)
pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
# manually calculate error at the 1st iteration:
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6)
mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
expect_equal(mpred, mpred1)
@ -211,14 +211,14 @@ test_that("train and predict softmax", {
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris))
err <- sum(pred != lb) / length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
})
test_that("train and predict RF", {
@ -232,12 +232,12 @@ test_that("train and predict RF", {
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1,
watchlist = list(train = xgb.DMatrix(train$data, label = lb))
)
expect_equal(bst$niter, 1)
expect_equal(xgb.get.num.boosted.rounds(bst), 1)
expect_equal(xgb.ntree(bst), 20)
pred <- predict(bst, train$data)
pred_err <- sum((pred > 0.5) != lb) / length(lb)
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6)
# expect_lt(pred_err, 0.03)
pred <- predict(bst, train$data, ntreelimit = 20)
@ -260,18 +260,18 @@ test_that("train and predict RF with softprob", {
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5,
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
)
expect_equal(bst$niter, 15)
expect_equal(xgb.get.num.boosted.rounds(bst), 15)
expect_equal(xgb.ntree(bst), 15 * 3 * 4)
# predict for all iterations:
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
expect_equal(dim(pred), c(nrow(iris), 3))
pred_labels <- max.col(pred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
# predict for 7 iterations and adjust for 4 parallel trees per iteration
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
err <- sum((max.col(pred) - 1) != lb) / length(lb)
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})
test_that("use of multiple eval metrics works", {
@ -284,9 +284,9 @@ test_that("use of multiple eval metrics works", {
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst$evaluation_log))
expect_equal(dim(bst$evaluation_log), c(2, 4))
expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_equal(dim(attributes(bst)$evaluation_log), c(2, 4))
expect_equal(colnames(attributes(bst)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_output(
bst2 <- xgb.train(
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
@ -296,9 +296,9 @@ test_that("use of multiple eval metrics works", {
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(2, 4))
expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_false(is.null(attributes(bst2)$evaluation_log))
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 4))
expect_equal(colnames(attributes(bst2)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
})
@ -318,41 +318,25 @@ test_that("training continuation works", {
# continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
expect_false(is.null(attributes(bst2)$evaluation_log))
expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2))
expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log)
# test continuing from raw model data
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_equal(dim(bst2$evaluation_log), c(2, 2))
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
# test continuing from a model in file
fname <- file.path(tempdir(), "xgboost.json")
xgb.save(bst1, fname)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = fname)
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_equal(dim(bst2$evaluation_log), c(2, 2))
})
test_that("model serialization works", {
out_path <- file.path(tempdir(), "model_serialization")
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic", nthread = n_threads)
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
raw <- xgb.serialize(booster)
saveRDS(raw, out_path)
raw <- readRDS(out_path)
loaded <- xgb.unserialize(raw)
raw_from_loaded <- xgb.serialize(loaded)
expect_equal(raw, raw_from_loaded)
file.remove(out_path)
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
})
test_that("xgb.cv works", {
@ -455,8 +439,8 @@ test_that("max_delta_step works", {
# model with restricted max_delta_step
bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
# the no-restriction model is expected to have consistently lower loss during the initial iterations
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8)
expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss))
expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8)
})
test_that("colsample_bytree works", {
@ -675,3 +659,131 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", {
pred_gr <- predict(model_gr, x)
expect_equal(pred_qid, pred_gr)
})
test_that("Coefficients from gblinear have the expected shape and names", {
# Single-column coefficients
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
mm <- model.matrix(~., data = mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(
booster = "gblinear",
nthread = 1
),
nrounds = 3
)
coefs <- coef(model)
expect_equal(length(coefs), ncol(x) + 1)
expect_equal(names(coefs), c("(Intercept)", colnames(x)))
pred_auto <- predict(model, x)
pred_manual <- as.numeric(mm %*% coefs)
expect_equal(pred_manual, pred_auto, tolerance = 1e-5)
# Multi-column coefficients
data(iris)
y <- as.numeric(iris$Species) - 1
x <- as.matrix(iris[, -5])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
mm <- model.matrix(~., data = iris[, -5])
model <- xgb.train(
data = dm,
params = list(
booster = "gblinear",
objective = "multi:softprob",
num_class = 3,
nthread = 1
),
nrounds = 3
)
coefs <- coef(model)
expect_equal(nrow(coefs), ncol(x) + 1)
expect_equal(ncol(coefs), 3)
expect_equal(row.names(coefs), c("(Intercept)", colnames(x)))
pred_auto <- predict(model, x, outputmargin = TRUE, reshape = TRUE)
pred_manual <- unname(mm %*% coefs)
expect_equal(pred_manual, pred_auto, tolerance = 1e-7)
})
test_that("Deep copies work as expected", {
data(mtcars)
y <- mtcars$mpg
x <- mtcars[, -1]
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(nthread = 1),
nrounds = 3
)
xgb.attr(model, "my_attr") <- 100
model_shallow_copy <- model
xgb.attr(model_shallow_copy, "my_attr") <- 333
attr_orig <- xgb.attr(model, "my_attr")
attr_shallow <- xgb.attr(model_shallow_copy, "my_attr")
expect_equal(attr_orig, attr_shallow)
model_deep_copy <- xgb.copy.Booster(model)
xgb.attr(model_deep_copy, "my_attr") <- 444
attr_orig <- xgb.attr(model, "my_attr")
attr_deep <- xgb.attr(model_deep_copy, "my_attr")
expect_false(attr_orig == attr_deep)
})
test_that("Pointer comparison works as expected", {
library(xgboost)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
params = list(nthread = 1),
data = xgb.DMatrix(x, label = y, nthread = 1),
nrounds = 3
)
model_shallow_copy <- model
expect_true(xgb.is.same.Booster(model, model_shallow_copy))
model_deep_copy <- xgb.copy.Booster(model)
expect_false(xgb.is.same.Booster(model, model_deep_copy))
xgb.attr(model_shallow_copy, "my_attr") <- 111
expect_equal(xgb.attr(model, "my_attr"), "111")
expect_null(xgb.attr(model_deep_copy, "my_attr"))
})
test_that("DMatrix field are set to booster when training", {
set.seed(123)
y <- rnorm(100)
x <- matrix(rnorm(100 * 3), nrow = 100)
x[, 2] <- abs(as.integer(x[, 2]))
dm_unnamed <- xgb.DMatrix(x, label = y, nthread = 1)
dm_feature_names <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
dm_feature_types <- xgb.DMatrix(x, label = y)
setinfo(dm_feature_types, "feature_type", c("q", "c", "q"))
dm_both <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
setinfo(dm_both, "feature_type", c("q", "c", "q"))
params <- list(nthread = 1)
model_unnamed <- xgb.train(data = dm_unnamed, params = params, nrounds = 3)
model_feature_names <- xgb.train(data = dm_feature_names, params = params, nrounds = 3)
model_feature_types <- xgb.train(data = dm_feature_types, params = params, nrounds = 3)
model_both <- xgb.train(data = dm_both, params = params, nrounds = 3)
expect_null(getinfo(model_unnamed, "feature_name"))
expect_equal(getinfo(model_feature_names, "feature_name"), c("a", "b", "c"))
expect_null(getinfo(model_feature_types, "feature_name"))
expect_equal(getinfo(model_both, "feature_name"), c("a", "b", "c"))
expect_null(variable.names(model_unnamed))
expect_equal(variable.names(model_feature_names), c("a", "b", "c"))
expect_null(variable.names(model_feature_types))
expect_equal(variable.names(model_both), c("a", "b", "c"))
expect_null(getinfo(model_unnamed, "feature_type"))
expect_null(getinfo(model_feature_names, "feature_type"))
expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q"))
expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q"))
})

View File

@ -111,9 +111,9 @@ test_that("can store evaluation_log without printing", {
expect_silent(
bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
)
expect_false(is.null(bst$evaluation_log))
expect_false(is.null(bst$evaluation_log$train_error))
expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_false(is.null(attributes(bst)$evaluation_log$train_error))
expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2)
})
test_that("cb.reset.parameters works as expected", {
@ -121,34 +121,34 @@ test_that("cb.reset.parameters works as expected", {
# fixed eta
set.seed(111)
bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
expect_false(is.null(bst0$evaluation_log))
expect_false(is.null(bst0$evaluation_log$train_error))
expect_false(is.null(attributes(bst0)$evaluation_log))
expect_false(is.null(attributes(bst0)$evaluation_log$train_error))
# same eta but re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.9, 0.9))
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst1$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
bst1$evaluation_log$train_error)
expect_false(is.null(attributes(bst1)$evaluation_log$train_error))
expect_equal(attributes(bst0)$evaluation_log$train_error,
attributes(bst1)$evaluation_log$train_error)
# same eta but re-set via a function in the callback
set.seed(111)
my_par <- list(eta = function(itr, itr_end) 0.9)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst2$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
bst2$evaluation_log$train_error)
expect_false(is.null(attributes(bst2)$evaluation_log$train_error))
expect_equal(attributes(bst0)$evaluation_log$train_error,
attributes(bst2)$evaluation_log$train_error)
# different eta re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.6, 0.5))
bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst3$evaluation_log$train_error))
expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
expect_false(is.null(attributes(bst3)$evaluation_log$train_error))
expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error))
# resetting multiple parameters at the same time runs with no error
my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
@ -166,8 +166,8 @@ test_that("cb.reset.parameters works as expected", {
my_par <- list(eta = c(0., 0.))
bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bstX$evaluation_log$train_error))
er <- unique(bstX$evaluation_log$train_error)
expect_false(is.null(attributes(bstX)$evaluation_log$train_error))
er <- unique(attributes(bstX)$evaluation_log$train_error)
expect_length(er, 1)
expect_gt(er, 0.4)
})
@ -183,14 +183,14 @@ test_that("cb.save.model works as expected", {
expect_true(file.exists(files[2]))
b1 <- xgb.load(files[1])
xgb.parameters(b1) <- list(nthread = 2)
expect_equal(xgb.ntree(b1), 1)
expect_equal(xgb.get.num.boosted.rounds(b1), 1)
b2 <- xgb.load(files[2])
xgb.parameters(b2) <- list(nthread = 2)
expect_equal(xgb.ntree(b2), 2)
expect_equal(xgb.get.num.boosted.rounds(b2), 2)
xgb.config(b2) <- xgb.config(bst)
expect_equal(xgb.config(bst), xgb.config(b2))
expect_equal(bst$raw, b2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))
# save_period = 0 saves the last iteration's model
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
@ -198,7 +198,7 @@ test_that("cb.save.model works as expected", {
expect_true(file.exists(files[3]))
b2 <- xgb.load(files[3])
xgb.config(b2) <- xgb.config(bst)
expect_equal(bst$raw, b2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))
for (f in files) if (file.exists(f)) file.remove(f)
})
@ -209,14 +209,14 @@ test_that("early stopping xgb.train works", {
bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
early_stopping_rounds = 3, maximize = FALSE)
, "Stopping. Best iteration")
expect_false(is.null(bst$best_iteration))
expect_lt(bst$best_iteration, 19)
expect_equal(bst$best_iteration, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
pred <- predict(bst, dtest)
expect_equal(length(pred), 1611)
err_pred <- err(ltest, pred)
err_log <- bst$evaluation_log[bst$best_iteration, test_error]
err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error]
expect_equal(err_log, err_pred, tolerance = 5e-6)
set.seed(11)
@ -224,15 +224,15 @@ test_that("early stopping xgb.train works", {
bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
)
expect_equal(bst$evaluation_log, bst0$evaluation_log)
expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log)
fname <- file.path(tempdir(), "model.bin")
xgb.save(bst, fname)
loaded <- xgb.load(fname)
expect_false(is.null(loaded$best_iteration))
expect_equal(loaded$best_iteration, bst$best_ntreelimit)
expect_equal(loaded$best_ntreelimit, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(loaded, "best_iteration")))
expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit"))
})
test_that("early stopping using a specific metric works", {
@ -243,14 +243,14 @@ test_that("early stopping using a specific metric works", {
callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
metric_name = 'test_logloss')))
, "Stopping. Best iteration")
expect_false(is.null(bst$best_iteration))
expect_lt(bst$best_iteration, 19)
expect_equal(bst$best_iteration, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
pred <- predict(bst, dtest, ntreelimit = bst$best_ntreelimit)
pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit"))
expect_equal(length(pred), 1611)
logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest)
logloss_log <- bst$evaluation_log[bst$best_iteration, test_logloss]
logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss]
expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
})

View File

@ -35,9 +35,9 @@ num_round <- 2
test_that("custom objective works", {
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_false(is.null(bst$evaluation_log))
expect_false(is.null(bst$evaluation_log$eval_error))
expect_lt(bst$evaluation_log[num_round, eval_error], 0.03)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_false(is.null(attributes(bst)$evaluation_log$eval_error))
expect_lt(attributes(bst)$evaluation_log[num_round, eval_error], 0.03)
})
test_that("custom objective in CV works", {
@ -50,7 +50,7 @@ test_that("custom objective in CV works", {
test_that("custom objective with early stop works", {
bst <- xgb.train(param, dtrain, 10, watchlist)
expect_equal(class(bst), "xgb.Booster")
train_log <- bst$evaluation_log$train_error
train_log <- attributes(bst)$evaluation_log$train_error
expect_true(all(diff(train_log) <= 0))
})

View File

@ -24,28 +24,28 @@ test_that("gblinear works", {
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
ypred <- predict(bst, dtest)
expect_equal(length(getinfo(dtest, 'label')), 1611)
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic',
callbacks = list(cb.gblinear.history()))
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
h <- xgb.gblinear.history(bst)
expect_equal(dim(h), c(n, ncol(dtrain) + 1))
expect_is(h, "matrix")
param$updater <- 'coord_descent'
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy')
expect_lt(bst$evaluation_log$eval_error[2], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL)
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty',
top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE)))
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
h <- xgb.gblinear.history(bst)
expect_equal(dim(h), c(n, ncol(dtrain) + 1))
expect_s4_class(h, "dgCMatrix")
@ -72,10 +72,10 @@ test_that("gblinear early stopping works", {
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)
expect_equal(booster$best_iteration, 5)
expect_equal(xgb.attr(booster, "best_iteration"), 5)
predt_es <- predict(booster, dtrain)
n <- booster$best_iteration + es_round
n <- xgb.attr(booster, "best_iteration") + es_round
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)

View File

@ -49,6 +49,9 @@ mbst.GLM <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel),
booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
objective = "multi:softprob", num_class = nclass, base_score = 0)
# without feature names
bst.Tree.unnamed <- xgb.copy.Booster(bst.Tree)
setinfo(bst.Tree.unnamed, "feature_name", NULL)
test_that("xgb.dump works", {
.skip_if_vcd_not_available()
@ -204,7 +207,7 @@ test_that("xgb-attribute functionality", {
list.ch <- list.val[order(names(list.val))]
list.ch <- lapply(list.ch, as.character)
# note: iter is 0-index in xgb attributes
list.default <- list(niter = as.character(nrounds - 1))
list.default <- list()
list.ch <- c(list.ch, list.default)
# proper input:
expect_error(xgb.attr(bst.Tree, NULL))
@ -212,24 +215,25 @@ test_that("xgb-attribute functionality", {
# set & get:
expect_null(xgb.attr(bst.Tree, "asdf"))
expect_equal(xgb.attributes(bst.Tree), list.default)
xgb.attr(bst.Tree, "my_attr") <- val
expect_equal(xgb.attr(bst.Tree, "my_attr"), val)
xgb.attributes(bst.Tree) <- list.val
expect_equal(xgb.attributes(bst.Tree), list.ch)
bst.Tree.copy <- xgb.copy.Booster(bst.Tree)
xgb.attr(bst.Tree.copy, "my_attr") <- val
expect_equal(xgb.attr(bst.Tree.copy, "my_attr"), val)
xgb.attributes(bst.Tree.copy) <- list.val
expect_equal(xgb.attributes(bst.Tree.copy), list.ch)
# serializing:
fname <- file.path(tempdir(), "xgb.model")
xgb.save(bst.Tree, fname)
fname <- file.path(tempdir(), "xgb.ubj")
xgb.save(bst.Tree.copy, fname)
bst <- xgb.load(fname)
expect_equal(xgb.attr(bst, "my_attr"), val)
expect_equal(xgb.attributes(bst), list.ch)
# deletion:
xgb.attr(bst, "my_attr") <- NULL
expect_null(xgb.attr(bst, "my_attr"))
expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
expect_equal(xgb.attributes(bst), list.ch[c("a", "b")])
xgb.attributes(bst) <- list(a = NULL, b = NULL)
expect_equal(xgb.attributes(bst), list.default)
xgb.attributes(bst) <- list(niter = NULL)
expect_null(xgb.attributes(bst))
expect_equal(xgb.attributes(bst), list())
})
if (grepl('Windows', Sys.info()[['sysname']], fixed = TRUE) ||
@ -262,21 +266,17 @@ test_that("xgb.Booster serializing as R object works", {
dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
fname_bin <- file.path(tempdir(), "xgb.model")
xgb.save(bst, fname_bin)
bst <- readRDS(fname_rds)
nil_ptr <- new("externalptr")
class(nil_ptr) <- "xgb.Booster.handle"
expect_true(identical(bst$handle, nil_ptr))
bst <- xgb.Booster.complete(bst)
expect_true(!identical(bst$handle, nil_ptr))
expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
})
test_that("xgb.model.dt.tree works with and without feature names", {
.skip_if_vcd_not_available()
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
dt.tree <- xgb.model.dt.tree(model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
if (!flag_32bit)
expect_equal(dim(dt.tree), c(188, 10))
@ -286,9 +286,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
expect_equal(dt.tree, dt.tree.0)
# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])
@ -316,9 +314,7 @@ test_that("xgb.importance works with and without feature names", {
expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)
# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
importance.Tree.x <- xgb.importance(model = bst.Tree)
importance.Tree.x <- xgb.importance(model = bst.Tree.unnamed)
expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
tolerance = float_tolerance)
@ -334,7 +330,7 @@ test_that("xgb.importance works with and without feature names", {
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
importance_from_dump <- function() {
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees)
imp <- xgb.model.dt.tree(
feature_names = feature.names,
text = model_text_dump,
@ -414,13 +410,13 @@ test_that("xgb.plot.importance de-duplicates features", {
test_that("xgb.plot.tree works with and without feature names", {
.skip_if_vcd_not_available()
expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree))
expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree.unnamed))
expect_silent(xgb.plot.tree(model = bst.Tree))
})
test_that("xgb.plot.multi.trees works with and without feature names", {
.skip_if_vcd_not_available()
xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3)
xgb.plot.multi.trees(model = bst.Tree.unnamed, feature_names = feature.names, features_keep = 3)
xgb.plot.multi.trees(model = bst.Tree, features_keep = 3)
})

View File

@ -17,8 +17,8 @@ test_that("load/save raw works", {
ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj")
old_bytes <- xgb.save.raw(booster, raw_format = "deprecated")
from_json <- xgb.load.raw(json_bytes, as_booster = TRUE)
from_ubj <- xgb.load.raw(ubj_bytes, as_booster = TRUE)
from_json <- xgb.load.raw(json_bytes)
from_ubj <- xgb.load.raw(ubj_bytes)
json2old <- xgb.save.raw(from_json, raw_format = "deprecated")
ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated")
@ -26,3 +26,46 @@ test_that("load/save raw works", {
expect_equal(json2old, ubj2old)
expect_equal(json2old, old_bytes)
})
test_that("saveRDS preserves C and R attributes", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(nthread = 1, max_depth = 2),
nrounds = 5
)
attributes(model)$my_attr <- "qwerty"
xgb.attr(model, "c_attr") <- "asdf"
fname <- file.path(tempdir(), "xgb_model.Rds")
saveRDS(model, fname)
model_new <- readRDS(fname)
expect_equal(attributes(model_new)$my_attr, attributes(model)$my_attr)
expect_equal(xgb.attr(model, "c_attr"), xgb.attr(model_new, "c_attr"))
})
test_that("R serializers keep C config", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(
tree_method = "approx",
nthread = 1,
max_depth = 2
),
nrounds = 3
)
model_new <- unserialize(serialize(model, NULL))
expect_equal(
xgb.config(model)$learner$gradient_booster$gbtree_train_param$tree_method,
xgb.config(model_new)$learner$gradient_booster$gbtree_train_param$tree_method
)
expect_equal(variable.names(model), variable.names(model_new))
})

View File

@ -23,11 +23,7 @@ get_num_tree <- function(booster) {
}
run_booster_check <- function(booster, name) {
# If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
booster <- xgb.Booster.complete(booster)
}
config <- jsonlite::fromJSON(xgb.config(booster))
config <- xgb.config(booster)
run_model_param_check(config)
if (name == 'cls') {
testthat::expect_equal(get_num_tree(booster),
@ -76,6 +72,10 @@ test_that("Models from previous versions of XGBoost can be loaded", {
name <- m[3]
is_rds <- endsWith(model_file, '.rds')
is_json <- endsWith(model_file, '.json')
# TODO: update this test for new RDS format
if (is_rds) {
return(NULL)
}
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
booster <- readRDS(model_file)

View File

@ -19,12 +19,12 @@ bst <- xgb.train(data = dtrain,
objective = "binary:logistic")
test_that("call is exposed to R", {
expect_false(is.null(bst$call))
expect_is(bst$call, "call")
expect_false(is.null(attributes(bst)$call))
expect_is(attributes(bst)$call, "call")
})
test_that("params is exposed to R", {
model_params <- bst$params
model_params <- attributes(bst)$params
expect_is(model_params, "list")
expect_equal(model_params$eta, 1)
expect_equal(model_params$max_depth, 2)

View File

@ -17,8 +17,8 @@ test_that('Test ranking with unweighted data', {
eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
})
test_that('Test ranking with weighted data', {
@ -41,8 +41,8 @@ test_that('Test ranking with weighted data', {
)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
for (i in 1:10) {
pred <- predict(bst, newdata = dtrain, ntreelimit = i)
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?

View File

@ -40,7 +40,12 @@ test_that("updating the model works", {
bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1r <- xgb.model.dt.tree(model = bst1r)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1r)$evaluation_log)
expect_equal(
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))),
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1r, raw_format = "json"))),
tolerance = 1e-6
)
if (!win32_flag) {
expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
}
@ -51,7 +56,7 @@ test_that("updating the model works", {
bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr2r <- xgb.model.dt.tree(model = bst2r)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
expect_equal(attributes(bst2)$evaluation_log, attributes(bst2r)$evaluation_log)
if (!win32_flag) {
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2r[Feature == 'Leaf']$Gain)
}
@ -59,11 +64,25 @@ test_that("updating the model works", {
expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)
# process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
set.seed(123)
p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1u <- xgb.model.dt.tree(model = bst1u)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
expect_equal(
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))),
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1u, raw_format = "json"))),
tolerance = 1e-6
)
expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)
# same thing but with a serialized model
set.seed(123)
bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
tr1u <- xgb.model.dt.tree(model = bst1u)
# all should be the same when no subsampling
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)
# process type 'update' for model with subsampling, refreshing only the tree stats from training data:
@ -71,12 +90,12 @@ test_that("updating the model works", {
bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
tr2u <- xgb.model.dt.tree(model = bst2u)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
expect_equal(attributes(bst2)$evaluation_log, attributes(bst2u)$evaluation_log)
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Gain - tr2u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
# the results should be the same as for the model with an extra 'refresh' updater
expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
expect_equal(attributes(bst2r)$evaluation_log, attributes(bst2u)$evaluation_log)
if (!win32_flag) {
expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
}
@ -86,7 +105,7 @@ test_that("updating the model works", {
bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1ut <- xgb.model.dt.tree(model = bst1ut)
# should be the same evaluations but different gains and smaller cover (test data is smaller)
expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1ut)$evaluation_log)
expect_equal(tr1[Feature == 'Leaf']$Gain, tr1ut[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr1[Feature != 'Leaf']$Gain - tr1ut[Feature != 'Leaf']$Gain)), 100)
expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5)
@ -106,11 +125,12 @@ test_that("updating works for multiclass & multitree", {
# run update process for an original model with subsampling
p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0)
bst0u <- xgb.train(p0u, dtr, nrounds = xgb.get.num.boosted.rounds(bst0),
watchlist, xgb_model = bst0, verbose = 0)
tr0u <- xgb.model.dt.tree(model = bst0u)
# should be the same evaluation but different gains and larger cover
expect_equal(bst0$evaluation_log, bst0u$evaluation_log)
expect_equal(attributes(bst0)$evaluation_log, attributes(bst0u)$evaluation_log)
expect_equal(tr0[Feature == 'Leaf']$Gain, tr0u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr0[Feature != 'Leaf']$Gain - tr0u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr0u$Cover) / sum(tr0$Cover), 1.5)

View File

@ -1,223 +0,0 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother
\begin{document}
%\SweaveOpts{concordance=TRUE}
<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@
%
<<prelim,echo=FALSE>>=
xgboost.version <- packageDescription("xgboost")$Version
@
%
\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}
\thispagestyle{empty}
\clearpage
\setcounter{page}{1}
\section{Introduction}
This is an introductory document on using the \verb@xgboost@ package in R.
\verb@xgboost@ is short for eXtreme Gradient Boosting. It is an efficient
and scalable implementation of the gradient boosting framework of \citep{friedman2001greedy} \citep{friedman2000additive}.
The package includes an efficient linear model solver and a tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is made to be extensible, so that users can also easily define their own objectives. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with OpenMP. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
\item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
and linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective and
evaluation functions.}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}
\section{Example with Mushroom data}
In this section, we will illustrate some common usage of \verb@xgboost@. The
Mushroom data is from the UCI Machine Learning Repository \citep{Bache+Lichman:2013}.
<<Training and prediction with Mushroom data>>=
library(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
nrounds = 2, objective = "binary:logistic", nthread = 2)
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
xgb.parameters(bst) <- list(nthread = 2)
pred <- predict(bst, test$data)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.
Here we can save the model to a binary local file, and load it back when needed.
We can't inspect the trees inside a binary file; however, we have another function to save the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@
The output looks like
\begin{verbatim}
booster[0]:
0:[f28<1.00001] yes=1,no=2,missing=2
1:[f108<1.00001] yes=3,no=4,missing=4
3:leaf=1.85965
4:leaf=-1.94071
2:[f55<1.00001] yes=5,no=6,missing=6
5:leaf=-1.70044
6:leaf=1.71218
booster[1]:
0:[f59<1.00001] yes=1,no=2,missing=2
1:leaf=-6.23624
2:[f28<1.00001] yes=3,no=4,missing=4
3:leaf=-0.96853
4:leaf=0.784718
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value or weighted training instances.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
class(dtrain)
head(getinfo(dtrain,'label'))
@
We can also save the matrix to a binary file, and then load it back simply with
\verb@xgb.DMatrix@
<<save model>>=
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain = xgb.DMatrix('xgb.DMatrix')
@
\section{Advanced Examples}
The function \verb@xgboost@ is a simplified interface with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.
\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, while it supports advanced features such as custom objective and evaluation functions.
<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    preds <- 1 / (1 + exp(-preds))  # preds are margin scores; map to probabilities
    grad <- preds - labels          # first-order gradient of the log loss
    hess <- preds * (1 - preds)     # second-order gradient (Hessian)
    return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    err <- sqrt(mean((preds - labels)^2))  # root mean squared error
    return(list(metric = "RMSE", value = err))
}
dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, nthread = 2)
bst <- xgb.train(params = param, data = dtrain, nrounds = 2, watchlist = watchlist,
                 obj = logregobj, feval = evalerror, maximize = FALSE)
@
A customized objective function must return both the gradient and the
second-order gradient (Hessian) for each prediction.
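Note that a customized objective receives raw margin scores rather than probabilities; the same margin-scale predictions can be requested from a trained model (a sketch using the \verb@outputmargin@ option of \verb@predict@):
<<Margin sketch, eval=FALSE>>=
margin <- predict(bst, dtest, outputmargin = TRUE)
prob <- 1 / (1 + exp(-margin))  # apply the sigmoid manually
@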
We also have \verb@slice@ for row extraction, which is useful in
cross-validation.
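For example (a minimal sketch), a subset of rows can be taken as follows:
<<Slice sketch, eval=FALSE>>=
dsub <- slice(dtrain, 1:100)  # rows 1..100 as a new xgb.DMatrix
dim(dsub)
@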
For walkthrough demos, please see \verb@R-package/demo/@.
\section{The Higgs Boson competition}
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.
Here are the instructions to make a submission:
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on an Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the Higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.
\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.
Meanwhile, the result from \verb@xgboost@ reaches an
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{AMS score of 3.60} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.
\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}
\end{document}
<<Temp file cleaning, include=FALSE>>=
file.remove("xgb.DMatrix")
file.remove("model.dump")
file.remove("model.save")
@

View File

@ -107,7 +107,7 @@ train <- agaricus.train
test <- agaricus.test
```
> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
> In the real world, it would be up to you to make this division between `train` and `test` data.
Each variable is a `list` containing two things, `label` and `data`:
@ -155,11 +155,13 @@ We will train decision tree model using the following parameters:
bstSparse <- xgboost(
data = train$data
, label = train$label
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
)
```
@ -175,11 +177,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
bstDense <- xgboost(
data = as.matrix(train$data),
label = train$label,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
params = list(
max_depth = 2,
eta = 1,
nthread = 2,
objective = "binary:logistic"
),
nrounds = 2
)
```
@ -191,11 +195,13 @@ bstDense <- xgboost(
dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
bstDMatrix <- xgboost(
data = dtrain,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
params = list(
max_depth = 2,
eta = 1,
nthread = 2,
objective = "binary:logistic"
),
nrounds = 2
)
```
@ -209,11 +215,13 @@ One of the simplest way to see the training progress is to set the `verbose` opt
# verbose = 0, no message
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 0
)
```
@ -222,11 +230,13 @@ bst <- xgboost(
# verbose = 1, print evaluation metric
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 1
)
```
@ -235,11 +245,13 @@ bst <- xgboost(
# verbose = 2, also print information about tree
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 2
)
```
@ -336,12 +348,14 @@ watchlist <- list(train = dtrain, test = dtest)
bst <- xgb.train(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, watchlist = watchlist
, objective = "binary:logistic"
)
```
@ -349,7 +363,7 @@ bst <- xgb.train(
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
If you do not obtain such results with your own dataset, you should think about how you divided it into training and test sets. Maybe there is something to fix.
For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
@ -357,13 +371,15 @@ For a better understanding of the learning progression, you may want to have som
bst <- xgb.train(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
eta = 1
, nthread = 2
, objective = "binary:logistic"
, eval_metric = "error"
, eval_metric = "logloss"
)
, nrounds = 2
, watchlist = watchlist
, eval_metric = "error"
, eval_metric = "logloss"
, objective = "binary:logistic"
)
```
@ -377,14 +393,15 @@ Until now, all the learnings we have performed were based on boosting trees. **X
```{r linearBoosting, message=F, warning=F}
bst <- xgb.train(
data = dtrain
, booster = "gblinear"
, max_depth = 2
, nthread = 2
, params = list(
booster = "gblinear"
, nthread = 2
, objective = "binary:logistic"
, eval_metric = "error"
, eval_metric = "logloss"
)
, nrounds = 2
, watchlist = watchlist
, eval_metric = "error"
, eval_metric = "logloss"
, objective = "binary:logistic"
)
```
@ -406,12 +423,14 @@ xgb.DMatrix.save(dtrain, fname)
dtrain2 <- xgb.DMatrix(fname)
bst <- xgb.train(
data = dtrain2
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, watchlist = watchlist
, objective = "binary:logistic"
)
```
@ -492,17 +511,17 @@ file.remove(fname)
> result is `0`? We are good!
In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
In some very specific cases, you will want to save the model as an *R* binary vector. See below how to do it.
```{r saveLoadRBinVectorModel, message=F, warning=F}
# save model to R's raw vector
rawVec <- xgb.serialize(bst)
rawVec <- xgb.save.raw(bst)
# print class
print(class(rawVec))
# load binary model to R
bst3 <- xgb.load(rawVec)
bst3 <- xgb.load.raw(rawVec)
xgb.parameters(bst3) <- list(nthread = 2)
pred3 <- predict(bst3, test$data)

View File

@ -53,11 +53,10 @@ labels <- c(1, 1, 1,
data <- data.frame(dates = dates, labels = labels)
bst <- xgb.train(
data = xgb.DMatrix(as.matrix(data$dates), label = labels),
data = xgb.DMatrix(as.matrix(data$dates), label = labels, missing = NA),
nthread = 2,
nrounds = 1,
objective = "binary:logistic",
missing = NA,
max_depth = 1
)
```