[R] On-demand serialization + standardization of attributes (#9924)

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>

parent 01c4711556
commit d3a8d284ab
@@ -2,16 +2,19 @@
 S3method("[",xgb.DMatrix)
 S3method("dimnames<-",xgb.DMatrix)
+S3method(coef,xgb.Booster)
 S3method(dim,xgb.DMatrix)
 S3method(dimnames,xgb.DMatrix)
+S3method(getinfo,xgb.Booster)
 S3method(getinfo,xgb.DMatrix)
 S3method(predict,xgb.Booster)
-S3method(predict,xgb.Booster.handle)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)
+S3method(setinfo,xgb.Booster)
 S3method(setinfo,xgb.DMatrix)
 S3method(slice,xgb.DMatrix)
+S3method(variable.names,xgb.Booster)
 export("xgb.attr<-")
 export("xgb.attributes<-")
 export("xgb.config<-")
@@ -26,13 +29,13 @@ export(cb.save.model)
 export(getinfo)
 export(setinfo)
 export(slice)
-export(xgb.Booster.complete)
 export(xgb.DMatrix)
 export(xgb.DMatrix.hasinfo)
 export(xgb.DMatrix.save)
 export(xgb.attr)
 export(xgb.attributes)
 export(xgb.config)
+export(xgb.copy.Booster)
 export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
@@ -41,10 +44,12 @@ export(xgb.get.DMatrix.data)
 export(xgb.get.DMatrix.num.non.missing)
 export(xgb.get.DMatrix.qcut)
 export(xgb.get.config)
+export(xgb.get.num.boosted.rounds)
 export(xgb.ggplot.deepness)
 export(xgb.ggplot.importance)
 export(xgb.ggplot.shap.summary)
 export(xgb.importance)
+export(xgb.is.same.Booster)
 export(xgb.load)
 export(xgb.load.raw)
 export(xgb.model.dt.tree)
@@ -56,10 +61,8 @@ export(xgb.plot.shap.summary)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
-export(xgb.serialize)
 export(xgb.set.config)
 export(xgb.train)
-export(xgb.unserialize)
 export(xgboost)
 import(methods)
 importClassesFrom(Matrix,dgCMatrix)
@@ -88,8 +91,10 @@ importFrom(graphics,title)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
 importFrom(methods,new)
+importFrom(stats,coef)
 importFrom(stats,median)
 importFrom(stats,predict)
+importFrom(stats,variable.names)
 importFrom(utils,head)
 importFrom(utils,object.size)
 importFrom(utils,str)
@@ -228,7 +228,7 @@ cb.reset.parameters <- function(new_params) {
    })

    if (!is.null(env$bst)) {
-      xgb.parameters(env$bst$handle) <- pars
+      xgb.parameters(env$bst) <- pars
    } else {
      for (fd in env$bst_folds)
        xgb.parameters(fd$bst) <- pars
@@ -333,13 +333,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
    if (!is.null(env$bst)) {
      if (!inherits(env$bst, 'xgb.Booster'))
        stop("'bst' in the parent frame must be an 'xgb.Booster'")
-      if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
+      if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) {
        best_score <<- as.numeric(best_score)
-        best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
-        best_msg <<- as.numeric(xgb.attr(env$bst$handle, 'best_msg'))
+        best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1
+        best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg'))
      } else {
-        xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
-                                               best_score = best_score)
+        xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1,
+                                        best_score = best_score)
      }
    } else if (is.null(env$bst_folds) || is.null(env$basket)) {
      stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
@@ -348,7 +348,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,

  finalizer <- function(env) {
    if (!is.null(env$bst)) {
-      attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score'))
+      attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score'))
      if (best_score != attr_best_score) {
        # If the difference is too big, throw an error
        if (abs(best_score - attr_best_score) >= 1e-14) {
@@ -358,9 +358,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
        # If the difference is due to floating-point truncation, update best_score
        best_score <- attr_best_score
      }
-      env$bst$best_iteration <- best_iteration
-      env$bst$best_ntreelimit <- best_ntreelimit
-      env$bst$best_score <- best_score
+      xgb.attr(env$bst, "best_iteration") <- best_iteration
+      xgb.attr(env$bst, "best_ntreelimit") <- best_ntreelimit
+      xgb.attr(env$bst, "best_score") <- best_score
    } else {
      env$basket$best_iteration <- best_iteration
      env$basket$best_ntreelimit <- best_ntreelimit
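With this change the early-stopping results are kept as standardized booster attributes instead of ad-hoc R list fields, so they survive XGBoost's own serializers. A minimal sketch of reading them back, assuming a booster bst trained with early stopping:

# Sketch: best_* values live in the C booster as attribute strings;
# xgb.attr()/xgb.attributes() convert the best_* ones back to numeric.
attrs <- xgb.attributes(bst)
attrs$best_iteration  # zero-based, as stored by the callback
attrs$best_score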
@@ -412,11 +412,15 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
 #' @param save_period save the model to disk after every
 #'   \code{save_period} iterations; 0 means save the model at the end.
 #' @param save_name the name or path for the saved model file.
 #'
+#'   Note that the format of the model being saved is determined by the file
+#'   extension specified here (see \link{xgb.save} for details about how it works).
+#'
 #'   It can contain a \code{\link[base]{sprintf}} formatting specifier
 #'   to include the integer iteration number in the file name.
-#'   E.g., with \code{save_name} = 'xgboost_%04d.model',
-#'   the file saved at iteration 50 would be named "xgboost_0050.model".
+#'
+#'   E.g., with \code{save_name} = 'xgboost_%04d.ubj',
+#'   the file saved at iteration 50 would be named "xgboost_0050.ubj".
 #' @seealso \link{xgb.save}
 #' @details
 #' This callback function allows saving an xgb-model file, either periodically every \code{save_period} iterations or at the end.
 #'
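The save_name template is expanded with base R's sprintf(), so the iteration number is formatted exactly as in this short sketch:

# Sketch: how the callback builds the file name from the template.
sprintf("xgboost_%04d.ubj", 50)
#> [1] "xgboost_0050.ubj"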
@@ -430,7 +434,7 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE,
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
+cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {

  if (save_period < 0)
    stop("'save_period' cannot be negative")
@@ -440,8 +444,13 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
      stop("'save_model' callback requires the 'bst' booster object in its calling frame")

    if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
-        (save_period == 0 && env$iteration == env$end_iteration))
-      xgb.save(env$bst, sprintf(save_name, env$iteration))
+        (save_period == 0 && env$iteration == env$end_iteration)) {
+      # Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
+      suppressWarnings({
+        save_name <- sprintf(save_name, env$iteration)
+      })
+      xgb.save(env$bst, save_name)
+    }
  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.save.model'
@@ -512,8 +521,7 @@ cb.cv.predict <- function(save_models = FALSE) {
    env$basket$pred <- pred
    if (save_models) {
      env$basket$models <- lapply(env$bst_folds, function(fd) {
        xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
-        xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE)
+        return(fd$bst)
      })
    }
  }
@@ -665,7 +673,7 @@ cb.gblinear.history <- function(sparse = FALSE) {
    } else { # xgb.cv:
      cf <- vector("list", length(env$bst_folds))
      for (i in seq_along(env$bst_folds)) {
-        dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL))
+        dmp <- xgb.dump(env$bst_folds[[i]]$bst)
        cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE))
        if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector")
      }
@@ -685,14 +693,19 @@ cb.gblinear.history <- function(sparse = FALSE) {
  callback
}

-#' Extract gblinear coefficients history.
-#'
-#' A helper function to extract the matrix of linear coefficients' history
+#' @title Extract gblinear coefficients history.
+#' @description A helper function to extract the matrix of linear coefficients' history
 #' from a gblinear model created while using the \code{cb.gblinear.history()}
 #' callback.
+#' @details Note that this is an R-specific function that relies on R attributes that
+#' are not saved when using xgboost's own serialization functions like \link{xgb.load}
+#' or \link{xgb.load.raw}.
+#'
+#' In order for a serialized model to be accepted by this function, one must use R
+#' serializers such as \link{saveRDS}.
 #' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
-#' using the \code{cb.gblinear.history()} callback.
+#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
+#' loaded from \link{xgb.load} or \link{xgb.load.raw}.
 #' @param class_index zero-based class index to extract the coefficients for only that
 #'   specific class in a multinomial multiclass model. When it is NULL, all the
 #'   coefficients are returned. Has no effect in non-multiclass models.
@@ -713,20 +726,18 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
    stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class")
  is_cv <- inherits(model, "xgb.cv.synchronous")

-  if (is.null(model[["callbacks"]]) || is.null(model$callbacks[["cb.gblinear.history"]]))
+  if (is_cv) {
+    callbacks <- model$callbacks
+  } else {
+    callbacks <- attributes(model)$callbacks
+  }
+
+  if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history))
    stop("model must be trained while using the cb.gblinear.history() callback")

  if (!is_cv) {
    # extract num_class & num_feat from the internal model
-    dmp <- xgb.dump(model)
-    if (length(dmp) < 2 || dmp[2] != "bias:")
-      stop("It does not appear to be a gblinear model")
-    dmp <- dmp[-c(1, 2)]
-    n <- which(dmp == 'weight:')
-    if (length(n) != 1)
-      stop("It does not appear to be a gblinear model")
-    num_class <- n - 1
-    num_feat <- (length(dmp) - 4) / num_class
+    num_class <- xgb.num_class(model)
+    num_feat <- xgb.num_feature(model)
  } else {
    # in case of CV, the object is expected to have this info
    if (model$params$booster != "gblinear")
@@ -742,7 +753,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
      (class_index[1] < 0 || class_index[1] >= num_class))
    stop("class_index has to be within [0,", num_class - 1, "]")

-  coef_path <- environment(model$callbacks$cb.gblinear.history)[["coefs"]]
+  coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]]
  if (!is.null(class_index) && num_class > 1) {
    coef_path <- if (is.list(coef_path)) {
      lapply(coef_path,
@@ -148,19 +148,17 @@ check.custom.eval <- function(env = parent.frame()) {


 # Update a booster handle for an iteration with dtrain data
-xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
-  if (!identical(class(booster_handle), "xgb.Booster.handle")) {
-    stop("booster_handle must be of xgb.Booster.handle class")
-  }
+xgb.iter.update <- function(bst, dtrain, iter, obj) {
  if (!inherits(dtrain, "xgb.DMatrix")) {
    stop("dtrain must be of xgb.DMatrix class")
  }
+  handle <- xgb.get.handle(bst)

  if (is.null(obj)) {
-    .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain)
+    .Call(XGBoosterUpdateOneIter_R, handle, as.integer(iter), dtrain)
  } else {
    pred <- predict(
-      booster_handle,
+      bst,
      dtrain,
      outputmargin = TRUE,
      training = TRUE,
@@ -185,7 +183,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
    }

    .Call(
-      XGBoosterTrainOneIter_R, booster_handle, dtrain, iter, grad, hess
+      XGBoosterTrainOneIter_R, handle, dtrain, iter, grad, hess
    )
  }
  return(TRUE)
@@ -195,23 +193,22 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) {
 # Evaluate one iteration.
 # Returns a named vector of evaluation metrics
 # with the names in a 'datasetname-metricname' format.
-xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) {
-  if (!identical(class(booster_handle), "xgb.Booster.handle"))
-    stop("class of booster_handle must be xgb.Booster.handle")
+xgb.iter.eval <- function(bst, watchlist, iter, feval) {
+  handle <- xgb.get.handle(bst)

  if (length(watchlist) == 0)
    return(NULL)

  evnames <- names(watchlist)
  if (is.null(feval)) {
-    msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames))
+    msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames))
    mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2)
    res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
  } else {
    res <- sapply(seq_along(watchlist), function(j) {
      w <- watchlist[[j]]
      ## predict using all trees
-      preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
+      preds <- predict(bst, w, outputmargin = TRUE, iterationrange = c(1, 1))
      eval_res <- feval(preds, w)
      out <- eval_res$value
      names(out) <- paste0(evnames[j], "-", eval_res$metric)
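The default-metric branch above parses the raw evaluation string returned by XGBoosterEvalOneIter_R. A standalone sketch of that parsing step (the message text here is made up for illustration):

# Sketch: turn "[1]\ttrain-rmse:0.35\ttest-rmse:0.40" into a named numeric vector.
msg <- "[1]\ttrain-rmse:0.35\ttest-rmse:0.40"
mat <- matrix(strsplit(msg, "\\s+|:")[[1]][-1], nrow = 2)
res <- structure(as.numeric(mat[2, ]), names = mat[1, ])
res
#> train-rmse  test-rmse
#>       0.35       0.40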
@@ -352,16 +349,45 @@ xgb.createFolds <- function(y, k) {
 #' @name xgboost-deprecated
 NULL

-#' Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
-#' models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.
+#' @title Model Serialization and Compatibility
+#' @description
 #'
-#' It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
-#' \code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
-#' \code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
-#' the model is to be accessed in the future. If you train a model with the current version of
-#' XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
-#' accessible in later releases of XGBoost. To ensure that your model can be accessed in future
-#' releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
+#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
+#' \link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
+#' its own serializers with better compatibility guarantees, which allow loading
+#' said models in other language bindings of XGBoost.
+#'
+#' Note that an `xgb.Booster` object, outside of its core components, might also keep:\itemize{
+#' \item Additional model configuration (accessible through \link{xgb.config}),
+#' which includes model fitting parameters like `max_depth` and runtime parameters like `nthread`.
+#' These are not necessarily useful for prediction/importance/plotting.
+#' \item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
+#' which are kept as a `data.table` object, accessible through `attributes(model)$evaluation_log`
+#' if present.
+#' }
+#'
+#' The first ones (configurations, including attributes that are set and accessed through
+#' \link{xgb.attributes}) do not have the same compatibility guarantees as the model itself -
+#' that is, such configuration might be lost after loading the booster in a different XGBoost
+#' version, regardless of the serializer that was used. These are saved when using \link{saveRDS},
+#' but will be discarded if loaded into an incompatible XGBoost version. They are not saved when
+#' using XGBoost's serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.
+#'
+#' The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
+#' not saved when using XGBoost's own serializers. These attributes are only used for informational
+#' purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
+#' call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
+#' These R attributes are only preserved when using R's serializers.
+#'
+#' Note that XGBoost models in R from version `2.1.0` onwards and XGBoost models
+#' before version `2.1.0` have very different R object structures and are incompatible with
+#' each other. Hence, models that were saved with R serializers like `saveRDS` or `save` before
+#' version `2.1.0` will not work with later `xgboost` versions and vice versa. Be aware that
+#' the structure of R model objects could in theory change again in the future, so XGBoost's serializers
+#' should be preferred for long-term storage.
+#'
+#' Furthermore, note that using the package `qs` for serialization will require version 0.26 or
+#' higher of said package, and will have the same compatibility restrictions as R serializers.
 #'
 #' @details
 #' Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@@ -374,9 +400,10 @@ NULL
 #' The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
 #' as part of another R object.
 #'
-#' Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
-#' model but also internal configurations and parameters, and its format is not stable across
-#' multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
+#' Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
+#' as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
+#' control as it relies on R's serialization format (see e.g. the details section in
+#' \link{serialize} and \link{save} from base R).
 #'
 #' For more details and explanation about model persistence and archival, consult the page
 #' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.
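To make the trade-off concrete, here is a minimal sketch contrasting the two serialization routes (assuming the public R API as used elsewhere in this diff):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = "binary:logistic", nthread = 1)

# XGBoost's own serializer: portable across versions and language bindings,
# but drops R-specific attributes such as the evaluation log.
fname <- file.path(tempdir(), "model.ubj")
xgb.save(bst, fname)
bst2 <- xgb.load(fname)

# R's serializer: keeps the R attributes, but ties the object to this
# xgboost version's R object structure.
rds <- file.path(tempdir(), "model.rds")
saveRDS(bst, rds)
bst3 <- readRDS(rds)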
@@ -1,180 +1,85 @@
-# Construct an internal xgboost Booster and return a handle to it.
+# Construct an internal xgboost Booster and get its current number of rounds.
 # internal utility function
-xgb.Booster.handle <- function(params, cachelist, modelfile, handle) {
+# Note: the number of rounds in the C booster gets reset to zero when changing
+# key booster parameters like 'process_type=update', but in some cases, when
+# replacing previous iterations, it needs to make a check that the new number
+# of iterations doesn't exceed the previous ones, hence it keeps track of the
+# current number of iterations before resetting the parameters in order to
+# perform the check later on.
+xgb.Booster <- function(params, cachelist, modelfile) {
  if (typeof(cachelist) != "list" ||
      !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) {
    stop("cachelist must be a list of xgb.DMatrix objects")
  }
  ## Load existing model, dispatch for on disk model file and in memory buffer
  if (!is.null(modelfile)) {
-    if (typeof(modelfile) == "character") {
+    if (is.character(modelfile)) {
      ## A filename
-      handle <- .Call(XGBoosterCreate_R, cachelist)
+      bst <- .Call(XGBoosterCreate_R, cachelist)
      modelfile <- path.expand(modelfile)
-      .Call(XGBoosterLoadModel_R, handle, enc2utf8(modelfile[1]))
-      class(handle) <- "xgb.Booster.handle"
+      .Call(XGBoosterLoadModel_R, xgb.get.handle(bst), enc2utf8(modelfile[1]))
+      niter <- xgb.get.num.boosted.rounds(bst)
      if (length(params) > 0) {
-        xgb.parameters(handle) <- params
+        xgb.parameters(bst) <- params
      }
-      return(handle)
-    } else if (typeof(modelfile) == "raw") {
+      return(list(bst = bst, niter = niter))
+    } else if (is.raw(modelfile)) {
      ## A memory buffer
-      bst <- xgb.unserialize(modelfile, handle)
+      bst <- xgb.load.raw(modelfile)
+      niter <- xgb.get.num.boosted.rounds(bst)
      xgb.parameters(bst) <- params
-      return(bst)
+      return(list(bst = bst, niter = niter))
    } else if (inherits(modelfile, "xgb.Booster")) {
      ## A booster object
-      bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
-      bst <- xgb.unserialize(bst$raw)
+      bst <- .Call(XGDuplicate_R, modelfile)
+      niter <- xgb.get.num.boosted.rounds(bst)
      xgb.parameters(bst) <- params
-      return(bst)
+      return(list(bst = bst, niter = niter))
    } else {
      stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
    }
  }
  ## Create new model
-  handle <- .Call(XGBoosterCreate_R, cachelist)
-  class(handle) <- "xgb.Booster.handle"
+  bst <- .Call(XGBoosterCreate_R, cachelist)
  if (length(params) > 0) {
-    xgb.parameters(handle) <- params
+    xgb.parameters(bst) <- params
  }
-  return(handle)
+  return(list(bst = bst, niter = 0L))
}

-# Convert xgb.Booster.handle to xgb.Booster
-# internal utility function
-xgb.handleToBooster <- function(handle, raw) {
-  bst <- list(handle = handle, raw = raw)
-  class(bst) <- "xgb.Booster"
-  return(bst)
-}
-
-# Check whether xgb.Booster.handle is null
+# Check whether xgb.Booster handle is null
 # internal utility function
 is.null.handle <- function(handle) {
  if (is.null(handle)) return(TRUE)

-  if (!identical(class(handle), "xgb.Booster.handle"))
-    stop("argument type must be xgb.Booster.handle")
+  if (!inherits(handle, "externalptr"))
+    stop("argument type must be 'externalptr'")

-  if (.Call(XGCheckNullPtr_R, handle))
-    return(TRUE)
-
-  return(FALSE)
+  return(.Call(XGCheckNullPtr_R, handle))
}

-# Return a verified to be valid handle out of either xgb.Booster.handle or
-# xgb.Booster internal utility function
+# Return a verified to be valid handle out of xgb.Booster
+# internal utility function
 xgb.get.handle <- function(object) {
  if (inherits(object, "xgb.Booster")) {
-    handle <- object$handle
-  } else if (inherits(object, "xgb.Booster.handle")) {
-    handle <- object
+    handle <- object$ptr
+    if (is.null(handle) || !inherits(handle, "externalptr")) {
+      stop("'xgb.Booster' object is corrupted or is from an incompatible xgboost version.")
+    }
  } else {
-    stop("argument must be of either xgb.Booster or xgb.Booster.handle class")
+    stop("argument must be an 'xgb.Booster' object.")
  }
  if (is.null.handle(handle)) {
-    stop("invalid xgb.Booster.handle")
+    stop("invalid 'xgb.Booster' (blank 'externalptr').")
  }
-  handle
+  return(handle)
}

-#' Restore missing parts of an incomplete xgb.Booster object
-#'
-#' It attempts to complete an `xgb.Booster` object by restoring either its missing
-#' raw model memory dump (when it has no `raw` data but its `xgb.Booster.handle` is valid)
-#' or its missing internal handle (when its `xgb.Booster.handle` is not valid
-#' but it has a raw Booster memory dump).
-#'
-#' @param object Object of class `xgb.Booster`.
-#' @param saveraw A flag indicating whether to append `raw` Booster memory dump data
-#'   when it doesn't already exist.
-#'
-#' @details
-#'
-#' While this method is primarily for internal use, it might be useful in some practical situations.
-#'
-#' E.g., when an `xgb.Booster` model is saved as an R object and then is loaded as an R object,
-#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
-#' should still work for such a model object since those methods would be using
-#' `xgb.Booster.complete()` internally. However, one might find it to be more efficient to call the
-#' `xgb.Booster.complete()` function explicitly once after loading a model as an R-object.
-#' That would prevent further repeated implicit reconstruction of an internal booster model.
-#'
-#' @return
-#' An object of `xgb.Booster` class.
-#'
-#' @examples
-#'
-#' data(agaricus.train, package = "xgboost")
-#'
-#' bst <- xgboost(
-#'   data = agaricus.train$data,
-#'   label = agaricus.train$label,
-#'   max_depth = 2,
-#'   eta = 1,
-#'   nthread = 2,
-#'   nrounds = 2,
-#'   objective = "binary:logistic"
-#' )
-#'
-#' fname <- file.path(tempdir(), "xgb_model.Rds")
-#' saveRDS(bst, fname)
-#'
-#' # Warning: The resulting RDS file is only compatible with the current XGBoost version.
-#' # Refer to the section titled "a-compatibility-note-for-saveRDS-save".
-#' bst1 <- readRDS(fname)
-#' # the handle is invalid:
-#' print(bst1$handle)
-#'
-#' bst1 <- xgb.Booster.complete(bst1)
-#' # now the handle points to a valid internal booster model:
-#' print(bst1$handle)
-#'
-#' @export
-xgb.Booster.complete <- function(object, saveraw = TRUE) {
-  if (!inherits(object, "xgb.Booster"))
-    stop("argument type must be xgb.Booster")
-
-  if (is.null.handle(object$handle)) {
-    object$handle <- xgb.Booster.handle(
-      params = list(),
-      cachelist = list(),
-      modelfile = object$raw,
-      handle = object$handle
-    )
-  } else {
-    if (is.null(object$raw) && saveraw) {
-      object$raw <- xgb.serialize(object$handle)
-    }
-  }
-
-  attrs <- xgb.attributes(object)
-  if (!is.null(attrs$best_ntreelimit)) {
-    object$best_ntreelimit <- as.integer(attrs$best_ntreelimit)
-  }
-  if (!is.null(attrs$best_iteration)) {
-    ## Convert from 0 based back to 1 based.
-    object$best_iteration <- as.integer(attrs$best_iteration) + 1
-  }
-  if (!is.null(attrs$best_score)) {
-    object$best_score <- as.numeric(attrs$best_score)
-  }
-  if (!is.null(attrs$best_msg)) {
-    object$best_msg <- attrs$best_msg
-  }
-  if (!is.null(attrs$niter)) {
-    object$niter <- as.integer(attrs$niter)
-  }
-
-  return(object)
-}

 #' Predict method for XGBoost model
 #'
 #' Predicted values based on either xgboost model or model handle object.
 #'
-#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
+#' @param object Object of class `xgb.Booster`.
 #' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`,
 #'   local data file, or `xgb.DMatrix`.
 #'   For single-row predictions on sparse data, it is recommended to use the CSR format.
@@ -358,27 +263,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 6))
 #' sum(pred5 != lb) / length(lb)
 #'
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
          predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
          reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
-  object <- xgb.Booster.complete(object, saveraw = FALSE)
-
  if (!inherits(newdata, "xgb.DMatrix")) {
-    config <- jsonlite::fromJSON(xgb.config(object))
-    nthread <- strtoi(config$learner$generic_param$nthread)
+    nthread <- xgb.nthread(object)
    newdata <- xgb.DMatrix(
      newdata,
      missing = missing, nthread = NVL(nthread, -1)
    )
  }
  if (!is.null(object[["feature_names"]]) &&
      !is.null(colnames(newdata)) &&
      !identical(object[["feature_names"]], colnames(newdata)))
    stop("Feature names stored in `object` and `newdata` are different!")

-  if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
+  if (NVL(xgb.booster_type(object), '') == 'gblinear' || is.null(ntreelimit))
    ntreelimit <- 0

  if (ntreelimit != 0 && is.null(iterationrange)) {
@@ -391,11 +288,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
    ## both are specified, let libxgboost throw an error
  } else {
    ## no limit is supplied, use best
-    if (is.null(object$best_iteration)) {
+    best_iteration <- xgb.best_iteration(object)
+    if (is.null(best_iteration)) {
      iterationrange <- c(0, 0)
    } else {
      ## We don't need to + 1 as R is 1-based index.
-      iterationrange <- c(0, as.integer(object$best_iteration))
+      iterationrange <- c(0, as.integer(best_iteration))
    }
  }
  ## Handle the 0 length values.
@@ -438,7 +336,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  }

  predts <- .Call(
-    XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
+    XGBoosterPredictFromDMatrix_R,
+    xgb.get.handle(object),
+    newdata,
+    jsonlite::toJSON(args, auto_unbox = TRUE)
  )
  names(predts) <- c("shape", "results")
  shape <- predts$shape
@@ -509,22 +410,12 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  return(arr)
}

-#' @rdname predict.xgb.Booster
-#' @export
-predict.xgb.Booster.handle <- function(object, ...) {
-
-  bst <- xgb.handleToBooster(handle = object, raw = NULL)
-
-  ret <- predict(bst, ...)
-  return(ret)
-}
-

-#' Accessors for serializable attributes of a model
+#' @title Accessors for serializable attributes of a model
 #'
-#' These methods allow to manipulate the key-value attribute strings of an xgboost model.
+#' @description These methods allow to manipulate the key-value attribute strings of an xgboost model.
 #'
-#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
+#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
 #' @param name A non-empty character string specifying which attribute is to be accessed.
 #' @param value For `xgb.attr<-`, a value of an attribute; for `xgb.attributes<-`,
 #'   it is a list (or an object coercible to a list) with the names of attributes to set
@@ -546,16 +437,15 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #' change the value of that parameter for a model.
 #' Use [xgb.parameters<-()] to set or change model parameters.
 #'
-#' The attribute setters would usually work more efficiently for `xgb.Booster.handle`
-#' than for `xgb.Booster`, since only just a handle (pointer) would need to be copied.
-#' That would only matter if attributes need to be set many times.
-#' Note, however, that when feeding a handle of an `xgb.Booster` object to the attribute setters,
-#' the raw model cache of an `xgb.Booster` object would not be automatically updated,
-#' and it would be the user's responsibility to call [xgb.serialize()] to update it.
-#'
 #' The `xgb.attributes<-` setter either updates the existing or adds one or several attributes,
 #' but it doesn't delete the other existing attributes.
 #'
+#' Important: since this modifies the booster's C object, semantics for assignment here
+#' will differ from R's, as any object reference to the same booster will be modified
+#' too, while assignment of R attributes through `attributes(model)$<attr> <- <value>`
+#' will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
+#' example of these behaviors).
+#'
 #' @return
 #' - `xgb.attr()` returns either a string value of an attribute
 #'   or `NULL` if an attribute wasn't stored in a model.
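As a quick sketch of what the accessors return (attribute values live in the C booster as strings; only the best_* names are converted back to numeric by the getter shown below):

# Sketch, assuming a fitted booster 'bst':
xgb.attr(bst, "my_key") <- 10
xgb.attr(bst, "my_key")   # "10" - stored and returned as a string
xgb.attributes(bst)       # named list of all attributes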
@@ -597,14 +487,25 @@ predict.xgb.Booster.handle <- function(object, ...) {
 xgb.attr <- function(object, name) {
  if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
  handle <- xgb.get.handle(object)
-  .Call(XGBoosterGetAttr_R, handle, as.character(name[1]))
+  out <- .Call(XGBoosterGetAttr_R, handle, as.character(name[1]))
+  if (!NROW(out) || !nchar(out)) {
+    return(NULL)
+  }
+  if (!is.null(out)) {
+    if (name %in% c("best_iteration", "best_ntreelimit", "best_score")) {
+      out <- as.numeric(out)
+    }
+  }
+  return(out)
}

 #' @rdname xgb.attr
 #' @export
 `xgb.attr<-` <- function(object, name, value) {
-  if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
+  name <- as.character(name[1])
+  if (!NROW(name) || !nchar(name)) stop("invalid attribute name")
  handle <- xgb.get.handle(object)

  if (!is.null(value)) {
    # Coerce the elements to be scalar strings.
    # Q: should we warn user about non-scalar elements?
@@ -614,11 +515,8 @@ xgb.attr <- function(object, name) {
      value <- as.character(value[1])
    }
  }
-  .Call(XGBoosterSetAttr_R, handle, as.character(name[1]), value)
-  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
-    object$raw <- xgb.serialize(object$handle)
-  }
-  object
+  .Call(XGBoosterSetAttr_R, handle, name, value)
+  return(object)
}

 #' @rdname xgb.attr
@@ -626,12 +524,10 @@ xgb.attr <- function(object, name) {
 xgb.attributes <- function(object) {
  handle <- xgb.get.handle(object)
  attr_names <- .Call(XGBoosterGetAttrNames_R, handle)
-  if (is.null(attr_names)) return(NULL)
-  res <- lapply(attr_names, function(x) {
-    .Call(XGBoosterGetAttr_R, handle, x)
-  })
-  names(res) <- attr_names
-  res
+  if (!NROW(attr_names)) return(list())
+  out <- lapply(attr_names, function(name) xgb.attr(object, name))
+  names(out) <- attr_names
+  return(out)
}

 #' @rdname xgb.attr
@@ -641,31 +537,21 @@ xgb.attributes <- function(object) {
  if (is.null(names(a)) || any(nchar(names(a)) == 0)) {
    stop("attribute names cannot be empty strings")
  }
  # Coerce the elements to be scalar strings.
  # Q: should we warn a user about non-scalar elements?
  a <- lapply(a, function(x) {
    if (is.null(x)) return(NULL)
    if (is.numeric(x[1])) {
      format(x[1], digits = 17)
    } else {
      as.character(x[1])
    }
  })
-  handle <- xgb.get.handle(object)
  for (i in seq_along(a)) {
-    .Call(XGBoosterSetAttr_R, handle, names(a[i]), a[[i]])
+    xgb.attr(object, names(a[i])) <- a[[i]]
  }
-  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
-    object$raw <- xgb.serialize(object$handle)
-  }
-  object
+  return(object)
}

-#' Accessors for model parameters as JSON string
-#'
-#' @param object Object of class `xgb.Booster`.
-#' @param value A JSON string.
+#' @title Accessors for model parameters as JSON string
+#' @details Note that assignment is performed in-place on the booster C object, which,
+#' unlike assignment of R attributes, doesn't follow typical copy-on-write semantics -
+#' i.e. all references to the same booster will also get updated.
+#'
+#' See \link{xgb.copy.Booster} for an example of this behavior.
+#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place} when assigning to it.
+#' @param value An R list.
+#' @return `xgb.config` will return the parameters as an R list.
 #' @examples
 #' data(agaricus.train, package = "xgboost")
 #'
@@ -690,31 +576,36 @@ xgb.attributes <- function(object) {
 #' @export
 xgb.config <- function(object) {
  handle <- xgb.get.handle(object)
-  .Call(XGBoosterSaveJsonConfig_R, handle)
+  return(jsonlite::fromJSON(.Call(XGBoosterSaveJsonConfig_R, handle)))
}

 #' @rdname xgb.config
 #' @export
 `xgb.config<-` <- function(object, value) {
  handle <- xgb.get.handle(object)
-  .Call(XGBoosterLoadJsonConfig_R, handle, value)
-  object$raw <- NULL # force renew the raw buffer
-  object <- xgb.Booster.complete(object)
-  object
+  .Call(
+    XGBoosterLoadJsonConfig_R,
+    handle,
+    jsonlite::toJSON(value, auto_unbox = TRUE, null = "null")
+  )
+  return(object)
}

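With this change xgb.config() round-trips through jsonlite, so the configuration can be inspected and assigned back as an ordinary R list. A minimal sketch (field paths follow the JSON layout used elsewhere in this diff):

# Sketch, assuming a fitted booster 'bst':
config <- xgb.config(bst)
config$learner$generic_param$nthread   # runtime parameter, kept as a string
xgb.config(bst) <- config              # serialized back to JSON internally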
-#' Accessors for model parameters
+#' @title Accessors for model parameters
+#' @description Only the setter for xgboost parameters is currently implemented.
+#' @details Just like \link{xgb.attr}, this function will make in-place modifications
+#' on the booster object which do not follow typical R assignment semantics - that is,
+#' all references to the same booster will also be updated, unlike assignment of R
+#' attributes which follow copy-on-write semantics.
 #'
-#' Only the setter for xgboost parameters is currently implemented.
+#' See \link{xgb.copy.Booster} for an example of this behavior.
 #'
+#' Be aware that setting parameters of a fitted booster related to training continuation / updates
+#' will reset its number of rounds indicator to zero.
-#' @param object Object of class `xgb.Booster` or `xgb.Booster.handle`.
+#' @param object Object of class `xgb.Booster`. \bold{Will be modified in-place}.
 #' @param value A list (or an object coercible to a list) with the names of parameters to set
 #'   and the elements corresponding to parameter values.
 #'
-#' @details
-#' Note that the setter would usually work more efficiently for `xgb.Booster.handle`
-#' than for `xgb.Booster`, since only just a handle would need to be copied.
-#'
+#' @return The same booster `object`, which gets modified in-place.
 #' @examples
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
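A short sketch of the setter's in-place behavior (assuming a fitted booster bst):

# Sketch: updates the C object directly; every R reference to this booster sees it.
bst_ref <- bst
xgb.parameters(bst) <- list(eta = 0.1)
# 'bst_ref' points to the same C booster, so it was updated as well.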
@@ -751,28 +642,301 @@ xgb.config <- function(object) {
  for (i in seq_along(p)) {
    .Call(XGBoosterSetParam_R, handle, names(p[i]), p[[i]])
  }
-  if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
-    object$raw <- xgb.serialize(object$handle)
-  }
-  object
+  return(object)
}

+#' @rdname getinfo
+#' @export
+getinfo.xgb.Booster <- function(object, name) {
+  name <- as.character(head(name, 1L))
+  allowed_fields <- c("feature_name", "feature_type")
+  if (!(name %in% allowed_fields)) {
+    stop("getinfo: name must be one of the following: ", paste(allowed_fields, collapse = ", "))
+  }
+  handle <- xgb.get.handle(object)
+  out <- .Call(
+    XGBoosterGetStrFeatureInfo_R,
+    handle,
+    name
+  )
+  if (!NROW(out)) {
+    return(NULL)
+  }
+  return(out)
+}
+
+#' @rdname getinfo
+#' @export
+setinfo.xgb.Booster <- function(object, name, info) {
+  name <- as.character(head(name, 1L))
+  allowed_fields <- c("feature_name", "feature_type")
+  if (!(name %in% allowed_fields)) {
+    stop("setinfo: unknown info name ", name)
+  }
+  info <- as.character(info)
+  handle <- xgb.get.handle(object)
+  .Call(
+    XGBoosterSetStrFeatureInfo_R,
+    handle,
+    name,
+    info
+  )
+  return(TRUE)
+}
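These two methods are the booster-side counterpart of the DMatrix accessors further down; only the feature metadata fields are supported. A small usage sketch (assuming a booster bst fitted on a matrix x):

# Sketch: feature metadata round-trips through the C booster as strings.
setinfo(bst, "feature_name", colnames(x))
getinfo(bst, "feature_name")
variable.names(bst)  # equivalent accessor added in this diff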
+#' @title Get number of boosting rounds in a fitted booster
+#' @param model A fitted `xgb.Booster` model.
+#' @return The number of rounds saved in the model, as an integer.
+#' @details Note that setting booster parameters related to training
+#' continuation / updates through \link{xgb.parameters<-} will reset the
+#' number of rounds to zero.
+#' @export
+xgb.get.num.boosted.rounds <- function(model) {
+  return(.Call(XGBoosterBoostedRounds_R, xgb.get.handle(model)))
+}
+
+#' @title Get Features Names from Booster
+#' @description Returns the feature / variable / column names from a fitted
+#' booster object, which are set automatically during the call to \link{xgb.train}
+#' from the DMatrix names, or which can be set manually through \link{setinfo}.
+#'
+#' If the object doesn't have feature names, will return `NULL`.
+#'
+#' It is equivalent to calling `getinfo(object, "feature_name")`.
+#' @param object An `xgb.Booster` object.
+#' @param ... Not used.
+#' @export
+variable.names.xgb.Booster <- function(object, ...) {
+  return(getinfo(object, "feature_name"))
+}
+
 # Extract the number of trees in a model.
-# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration.
 # internal utility function
 xgb.ntree <- function(bst) {
-  length(grep('^booster', xgb.dump(bst)))
+  config <- xgb.config(bst)
+  out <- strtoi(config$learner$gradient_booster$gbtree_model_param$num_trees)
+  return(out)
}

+xgb.nthread <- function(bst) {
+  config <- xgb.config(bst)
+  out <- strtoi(config$learner$generic_param$nthread)
+  return(out)
+}
+
-#' Print xgb.Booster
+xgb.booster_type <- function(bst) {
+  config <- xgb.config(bst)
+  out <- config$learner$learner_train_param$booster
+  return(out)
+}
+
+xgb.num_class <- function(bst) {
+  config <- xgb.config(bst)
+  out <- strtoi(config$learner$learner_model_param$num_class)
+  return(out)
+}
+
+xgb.feature_names <- function(bst) {
+  return(getinfo(bst, "feature_name"))
+}
+
+xgb.feature_types <- function(bst) {
+  return(getinfo(bst, "feature_type"))
+}
+
+xgb.num_feature <- function(bst) {
+  handle <- xgb.get.handle(bst)
+  return(.Call(XGBoosterGetNumFeature_R, handle))
+}
+
+xgb.best_iteration <- function(bst) {
+  out <- xgb.attr(bst, "best_iteration")
+  if (!NROW(out) || !nchar(out)) {
+    out <- NULL
+  }
+  return(out)
+}
+
+#' @title Extract coefficients from linear booster
+#' @description Extracts the coefficients from a 'gblinear' booster object,
+#' as produced by \code{xgb.train} when using parameter `booster="gblinear"`.
+#'
-#' Print information about `xgb.Booster`.
+#' Note: this function will error out if passing a booster model
+#' which is not of "gblinear" type.
+#' @param object A fitted booster of 'gblinear' type.
+#' @param ... Not used.
+#' @return The extracted coefficients:\itemize{
+#' \item If there's only one coefficient per column in the data, will be returned as a
+#' vector, potentially containing the feature names if available, with the intercept
+#' as the first element.
+#' \item If there's more than one coefficient per column in the data (e.g. when using
+#' `objective="multi:softmax"`), will be returned as a matrix with dimensions equal
+#' to `[num_features, num_cols]`, with the intercepts as first row. Note that the column
+#' (classes in multi-class classification) dimension will not be named.
+#' }
+#'
+#' The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
+#' or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
+#' hence one should get the same values from calling `predict(..., outputmargin = TRUE)` and
+#' from performing a matrix multiplication with `model.matrix(~., ...)`.
+#'
+#' Be aware that the coefficients are obtained by first converting them to strings and
+#' back, so there will always be some very small loss of precision compared to the actual
+#' coefficients as used by \link{predict.xgb.Booster}.
+#' @examples
+#' library(xgboost)
+#' data(mtcars)
+#' y <- mtcars[, 1]
+#' x <- as.matrix(mtcars[, -1])
+#' dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
+#' params <- list(booster = "gblinear", nthread = 1)
+#' model <- xgb.train(data = dm, params = params, nrounds = 2)
+#' coef(model)
+#' @export
+coef.xgb.Booster <- function(object, ...) {
+  booster_type <- xgb.booster_type(object)
+  if (booster_type != "gblinear") {
+    stop("Coefficients are not defined for Booster type ", booster_type)
+  }
+  model_json <- jsonlite::fromJSON(rawToChar(xgb.save.raw(object, raw_format = "json")))
+  base_score <- model_json$learner$learner_model_param$base_score
+  num_feature <- as.numeric(model_json$learner$learner_model_param$num_feature)
+
+  weights <- model_json$learner$gradient_booster$model$weights
+  n_cols <- length(weights) / (num_feature + 1)
+  if (n_cols != floor(n_cols) || n_cols < 1) {
+    stop("Internal error: could not determine shape of coefficients.")
+  }
+  sep <- num_feature * n_cols
+  coefs <- weights[seq(1, sep)]
+  intercepts <- weights[seq(sep + 1, length(weights))]
+  intercepts <- intercepts + as.numeric(base_score)
+
+  feature_names <- xgb.feature_names(object)
+  if (!NROW(feature_names)) {
+    # This mimics the default naming in R which names columns as "V1..N"
+    # when names are needed but not available
+    feature_names <- paste0("V", seq(1L, num_feature))
+  }
+  feature_names <- c("(Intercept)", feature_names)
+  if (n_cols == 1L) {
+    out <- c(intercepts, coefs)
+    names(out) <- feature_names
+  } else {
+    coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE)
+    dim(intercepts) <- c(1L, n_cols)
+    out <- rbind(intercepts, coefs)
+    row.names(out) <- feature_names
+    # TODO: if a class names attributes is added,
+    # should use those names here.
+  }
+  return(out)
+}
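A quick numerical check of the documented identity between the returned coefficients and the model's margin predictions - a sketch reusing the model/x from the example above, with a tolerance that accounts for the string round-trip mentioned in the docs:

# Sketch: margin predictions should match the linear predictor built from coef().
margins <- predict(model, x, outputmargin = TRUE)
manual  <- drop(cbind(1, x) %*% coef(model))
all.equal(unname(margins), manual, tolerance = 1e-6)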
+#' @title Deep-copies a Booster Object
+#' @description Creates a deep copy of an 'xgb.Booster' object, such that the
+#' C object pointer contained will be a different object, and hence functions
+#' like \link{xgb.attr} will not affect the object from which it was copied.
+#' @param model An 'xgb.Booster' object.
+#' @return A deep copy of `model` - it will be identical in every way, but C-level
+#' functions called on that copy will not affect the `model` variable.
+#' @examples
+#' library(xgboost)
+#' data(mtcars)
+#' y <- mtcars$mpg
+#' x <- mtcars[, -1]
+#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
+#' model <- xgb.train(
+#'   data = dm,
+#'   params = list(nthread = 1),
+#'   nround = 3
+#' )
+#'
+#' # Set an arbitrary attribute kept at the C level
+#' xgb.attr(model, "my_attr") <- 100
+#' print(xgb.attr(model, "my_attr"))
+#'
+#' # Just assigning to a new variable will not create
+#' # a deep copy - C object pointer is shared, and in-place
+#' # modifications will affect both objects
+#' model_shallow_copy <- model
+#' xgb.attr(model_shallow_copy, "my_attr") <- 333
+#' # 'model' was also affected by this change:
+#' print(xgb.attr(model, "my_attr"))
+#'
+#' model_deep_copy <- xgb.copy.Booster(model)
+#' xgb.attr(model_deep_copy, "my_attr") <- 444
+#' # 'model' was NOT affected by this change
+#' # (keeps previous value that was assigned before)
+#' print(xgb.attr(model, "my_attr"))
+#'
+#' # Verify that the new object was actually modified
+#' print(xgb.attr(model_deep_copy, "my_attr"))
+#' @export
+xgb.copy.Booster <- function(model) {
+  if (!inherits(model, "xgb.Booster")) {
+    stop("'model' must be an 'xgb.Booster' object.")
+  }
+  return(.Call(XGDuplicate_R, model))
+}
+
+#' @title Check if two boosters share the same C object
+#' @description Checks whether two booster objects refer to the same underlying C object.
+#' @details As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
+#' object, they don't follow typical copy-on-write semantics of other R objects - that is, if
+#' one assigns a booster to a different variable and modifies that new variable through in-place
+#' methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
+#' variable, unlike typical R assignments which would only modify the latter.
+#'
+#' This function allows checking whether two booster objects share the same 'externalptr',
+#' regardless of the R attributes that they might have.
+#'
+#' In order to duplicate a booster in such a way that the copy wouldn't share the same
+#' 'externalptr', one can use function \link{xgb.copy.Booster}.
+#' @param obj1 Booster model to compare with `obj2`.
+#' @param obj2 Booster model to compare with `obj1`.
+#' @return Either `TRUE` or `FALSE` according to whether the two boosters share
+#' the underlying C object.
+#' @seealso \link{xgb.copy.Booster}
+#' @examples
+#' library(xgboost)
+#' data(mtcars)
+#' y <- mtcars$mpg
+#' x <- as.matrix(mtcars[, -1])
+#' model <- xgb.train(
+#'   params = list(nthread = 1),
+#'   data = xgb.DMatrix(x, label = y, nthread = 1),
+#'   nround = 3
+#' )
+#'
+#' model_shallow_copy <- model
+#' xgb.is.same.Booster(model, model_shallow_copy) # same C object
+#'
+#' model_deep_copy <- xgb.copy.Booster(model)
+#' xgb.is.same.Booster(model, model_deep_copy) # different C objects
+#'
+#' # In-place assignments modify all references,
+#' # but not full/deep copies of the booster
+#' xgb.attr(model_shallow_copy, "my_attr") <- 111
+#' xgb.attr(model, "my_attr") # gets modified
+#' xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
+#' @export
+xgb.is.same.Booster <- function(obj1, obj2) {
+  if (!inherits(obj1, "xgb.Booster") || !inherits(obj2, "xgb.Booster")) {
+    stop("'xgb.is.same.Booster' is only applicable to 'xgb.Booster' objects.")
+  }
+  return(
+    .Call(
+      XGPointerEqComparison_R,
+      xgb.get.handle(obj1),
+      xgb.get.handle(obj2)
+    )
+  )
+}
+
+#' @title Print xgb.Booster
+#' @description Print information about `xgb.Booster`.
 #' @param x An `xgb.Booster` object.
-#' @param verbose Whether to print detailed data (e.g., attribute values).
-#' @param ... Not currently used.
-#'
+#' @param ... Not used.
+#' @return The same `x` object, returned invisibly
 #' @examples
 #' data(agaricus.train, package = "xgboost")
 #' train <- agaricus.train
@@ -790,79 +954,40 @@ xgb.ntree <- function(bst) {
 #' attr(bst, "myattr") <- "memo"
 #'
 #' print(bst)
-#' print(bst, verbose = TRUE)
 #'
 #' @export
-print.xgb.Booster <- function(x, verbose = FALSE, ...) {
+print.xgb.Booster <- function(x, ...) {
+  # this lets it error out when the object comes from an earlier R xgboost version
+  handle <- xgb.get.handle(x)
  cat('##### xgb.Booster\n')

-  valid_handle <- !is.null.handle(x$handle)
-  if (!valid_handle)
-    cat("Handle is invalid! Suggest using xgb.Booster.complete\n")
-
-  cat('raw: ')
-  if (!is.null(x$raw)) {
-    cat(format(object.size(x$raw), units = "auto"), '\n')
-  } else {
-    cat('NULL\n')
-  }
-  if (!is.null(x$call)) {
+  R_attrs <- attributes(x)
+  if (!is.null(R_attrs$call)) {
    cat('call:\n ')
-    print(x$call)
+    print(R_attrs$call)
  }

-  if (!is.null(x$params)) {
-    cat('params (as set within xgb.train):\n')
-    cat(' ',
-        paste(names(x$params),
-              paste0('"', unlist(x$params), '"'),
-              sep = ' = ', collapse = ', '), '\n', sep = '')
-  }
-  # TODO: need an interface to access all the xgboosts parameters
+  cat('# of features:', xgb.num_feature(x), '\n')
+  cat('# of rounds: ', xgb.get.num.boosted.rounds(x), '\n')

-  attrs <- character(0)
-  if (valid_handle)
-    attrs <- xgb.attributes(x)
-  if (length(attrs) > 0) {
+  attr_names <- .Call(XGBoosterGetAttrNames_R, handle)
+  if (NROW(attr_names)) {
    cat('xgb.attributes:\n')
-    if (verbose) {
-      cat(paste(paste0(' ', names(attrs)),
-                paste0('"', unlist(attrs), '"'),
-                sep = ' = ', collapse = '\n'), '\n', sep = '')
-    } else {
-      cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
-    }
+    cat(" ", paste(attr_names, collapse = ", "), "\n")
  }

-  if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
+  if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) {
    cat('callbacks:\n')
-    lapply(callback.calls(x$callbacks), function(x) {
+    lapply(callback.calls(R_attrs$callbacks), function(x) {
      cat(' ')
      print(x)
    })
  }

-  if (!is.null(x$feature_names))
-    cat('# of features:', length(x$feature_names), '\n')
-
-  cat('niter: ', x$niter, '\n', sep = '')
-  # TODO: uncomment when faster xgb.ntree is implemented
-  #cat('ntree: ', xgb.ntree(x), '\n', sep='')
-
-  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
-                                'evaluation_log', 'niter', 'feature_names'))) {
-    if (is.atomic(x[[n]])) {
-      cat(n, ':', x[[n]], '\n', sep = ' ')
-    } else {
-      cat(n, ':\n\t', sep = ' ')
-      print(x[[n]])
-    }
-  }
-
-  if (!is.null(x$evaluation_log)) {
+  if (!is.null(R_attrs$evaluation_log)) {
    cat('evaluation_log:\n')
-    print(x$evaluation_log, row.names = FALSE, topn = 2)
+    print(R_attrs$evaluation_log, row.names = FALSE, topn = 2)
  }

-  invisible(x)
+  return(invisible(x))
}

@ -335,14 +335,13 @@ dimnames.xgb.DMatrix <- function(x) {
|
||||
}
|
||||
|
||||
|
||||
#' Get information of an xgb.DMatrix object
|
||||
#'
|
||||
#' Get information of an xgb.DMatrix object
|
||||
#' @param object Object of class \code{xgb.DMatrix}
|
||||
#' @title Get or set information of xgb.DMatrix and xgb.Booster objects
|
||||
#' @param object Object of class \code{xgb.DMatrix} of `xgb.Booster`.
|
||||
#' @param name the name of the information field to get (see details)
|
||||
#'
|
||||
#' @return For `getinfo`, will return the requested field. For `setinfo`, will always return value `TRUE`
|
||||
#' if it succeeds.
|
||||
#' @details
|
||||
#' The \code{name} field can be one of the following:
|
||||
#' The \code{name} field can be one of the following for `xgb.DMatrix`:
|
||||
#'
|
||||
#' \itemize{
|
||||
#' \item \code{label}
|
||||
@ -357,9 +356,17 @@ dimnames.xgb.DMatrix <- function(x) {
|
||||
#' }
|
||||
#' See the documentation for \link{xgb.DMatrix} for more information about these fields.
|
||||
#'
|
||||
#' For `xgb.Booster`, can be one of the following:
|
||||
#' \itemize{
|
||||
#' \item \code{feature_type}
|
||||
#' \item \code{feature_name}
|
||||
#' }
|
||||
#'
|
||||
#' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
|
||||
#' for a DMatrix that had 'qid' assigned.
|
||||
#'
|
||||
#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See
|
||||
#' \link{xgb.copy.Booster} for an idea of this in-place assignment works.
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
||||
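Since `setinfo` now also dispatches on boosters, a minimal sketch of the new booster fields (assuming the standard agaricus example data; not part of the committed examples):

library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
bst <- xgb.train(params = list(nthread = 2), data = dtrain, nrounds = 2)
# 'feature_name' and 'feature_type' are the fields accepted for boosters
setinfo(bst, 'feature_name', colnames(agaricus.train$data))
head(getinfo(bst, 'feature_name'))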
@ -412,13 +419,7 @@ getinfo.xgb.DMatrix <- function(object, name) {
return(ret)
}


#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @rdname getinfo
#' @param info the specific field of information to set
#'
#' @details
@ -441,11 +442,10 @@ getinfo.xgb.DMatrix <- function(object, name) {
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, name, info) UseMethod("setinfo")

#' @rdname setinfo
#' @rdname getinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info) {
.internal.setinfo.xgb.DMatrix(object, name, info)

@ -204,13 +204,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
dtrain <- slice(dall, unlist(folds[-k]))
else
dtrain <- slice(dall, train_folds[[k]])
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = params,
cachelist = list(dtrain, dtest),
modelfile = NULL,
handle = NULL
modelfile = NULL
)
list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
bst <- bst$bst
list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]])
})
rm(dall)
# a "basket" to collect some results from callbacks
@ -231,13 +231,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing

msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(
booster_handle = fd$bst,
bst = fd$bst,
dtrain = fd$dtrain,
iter = iteration - 1,
obj = obj
)
xgb.iter.eval(
booster_handle = fd$bst,
bst = fd$bst,
watchlist = fd$watchlist,
iter = iteration - 1,
feval = feval
@ -267,7 +267,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
ret <- c(ret, basket)

class(ret) <- 'xgb.cv.synchronous'
invisible(ret)
return(invisible(ret))
}



@ -56,9 +56,13 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats = FALSE,
if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be a character string (when provided)")

model <- xgb.Booster.complete(model)
model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format))
model_dump <- .Call(
XGBoosterDumpModel_R,
xgb.get.handle(model),
NVL(fmap, "")[1],
as.integer(with_stats),
as.character(dump_format)
)
if (dump_format == "dot") {
return(sapply(model_dump, function(x) gsub("^booster\\[\\d+\\]\\n", "\\1", x)))
}

@ -119,21 +119,21 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")

if (!inherits(model, "xgb.Booster"))
stop("model: must be an object of class xgb.Booster")

if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names
if (is.null(feature_names)) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names)) {
feature_names <- model_feature_names
}
}

if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector")

model <- xgb.Booster.complete(model)
config <- jsonlite::fromJSON(xgb.config(model))
if (config$learner$gradient_booster$name == "gblinear") {
handle <- xgb.get.handle(model)
if (xgb.booster_type(model) == "gblinear") {
args <- list(importance_type = "weight", feature_names = feature_names)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", "weight")
if (length(results$shape) == 2) {
@ -154,7 +154,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
for (importance_type in c("weight", "total_gain", "total_cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
XGBoosterFeatureScore_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", importance_type)
concatenated[

@ -17,7 +17,7 @@
#' An object of \code{xgb.Booster} class.
#'
#' @seealso
#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.save}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -46,25 +46,20 @@ xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop("xgb.load: modelfile cannot be NULL")

handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = list(),
cachelist = list(),
modelfile = modelfile,
handle = NULL
modelfile = modelfile
)
bst <- bst$bst
# re-use modelfile if it is raw so we do not need to serialize
if (typeof(modelfile) == "raw") {
warning(
paste(
"The support for loading raw booster with `xgb.load` will be ",
"discontinued in upcoming release. Use `xgb.load.raw` or",
" `xgb.unserialize` instead. "
"discontinued in upcoming release. Use `xgb.load.raw` instead. "
)
)
bst <- xgb.handleToBooster(handle = handle, raw = modelfile)
} else {
bst <- xgb.handleToBooster(handle = handle, raw = NULL)
}
bst <- xgb.Booster.complete(bst, saveraw = TRUE)
return(bst)
}

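Given the deprecation warning above, a minimal sketch of the preferred loading paths after this change (assuming a fitted booster `bst`; the file name is hypothetical):

raw <- xgb.save.raw(bst)           # in-memory raw vector
bst_from_raw <- xgb.load.raw(raw)  # preferred over passing raw vectors to xgb.load
fname <- file.path(tempdir(), "model.ubj")
xgb.save(bst, fname)               # on-disk file
bst_from_file <- xgb.load(fname)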
@ -3,21 +3,10 @@
#' User can generate raw memory buffer by calling xgb.save.raw
#'
#' @param buffer the buffer returned by xgb.save.raw
#' @param as_booster Return the loaded model as xgb.Booster instead of xgb.Booster.handle.
#'
#' @export
xgb.load.raw <- function(buffer, as_booster = FALSE) {
xgb.load.raw <- function(buffer) {
cachelist <- list()
handle <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
class(handle) <- "xgb.Booster.handle"

if (as_booster) {
booster <- list(handle = handle, raw = NULL)
class(booster) <- "xgb.Booster"
booster <- xgb.Booster.complete(booster, saveraw = TRUE)
return(booster)
} else {
return(handle)
}
bst <- .Call(XGBoosterCreate_R, cachelist)
.Call(XGBoosterLoadModelFromRaw_R, xgb.get.handle(bst), buffer)
return(bst)
}

@ -2,8 +2,10 @@
#'
#' Parse a boosted tree model text dump into a `data.table` structure.
#'
#' @param feature_names Character vector used to overwrite the feature names
#' of the model. The default (`NULL`) uses the original feature names.
#' @param feature_names Character vector of feature names. If the model already
#' contains feature names, those will be used when \code{feature_names=NULL} (default value).
#'
#' Note that, if the model already contains feature names, it's \bold{not} possible to override them here.
#' @param model Object of class `xgb.Booster`.
#' @param text Character vector previously generated by the function [xgb.dump()]
#' (called with parameter `with_stats = TRUE`). `text` takes precedence over `model`.
@ -54,8 +56,6 @@
#' objective = "binary:logistic"
#' )
#'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#'
#' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst))
@ -79,8 +79,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
" (or NULL if 'model' was provided).")
}

if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names
model_feature_names <- NULL
if (inherits(model, "xgb.Booster")) {
model_feature_names <- xgb.feature_names(model)
if (NROW(model_feature_names) && !is.null(feature_names)) {
stop("'model' contains feature names. Cannot override them.")
}
}
if (is.null(feature_names) && !is.null(model) && !is.null(model_feature_names))
feature_names <- model_feature_names

if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector")
@ -90,8 +97,10 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
stop("trees: must be a vector of integers.")
}

from_text <- TRUE
if (is.null(text)) {
text <- xgb.dump(model = model, with_stats = TRUE)
from_text <- FALSE
}

if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) {
@ -120,8 +129,28 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
td[, isLeaf := grepl("leaf", t, fixed = TRUE)]

# parse branch lines
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_rx_nonames <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_rx_w_names <- paste0("\\d+:\\[(.+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
text_has_feature_names <- FALSE
if (NROW(model_feature_names)) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
} else {
# Note: when passing a text dump, it might or might not have feature names,
# but that aspect is unknown from just the text attributes
branch_rx <- branch_rx_nonames
if (from_text) {
if (sum(grepl(branch_rx_w_names, text)) > sum(grepl(branch_rx_nonames, text))) {
branch_rx <- branch_rx_w_names
text_has_feature_names <- TRUE
}
}
}
if (text_has_feature_names && is.null(model) && !is.null(feature_names)) {
stop("'text' contains feature names. Cannot override them.")
}
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
td[
isLeaf == FALSE,
@ -144,10 +173,12 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
is_stump <- function() {
return(length(td$Feature) == 1 && is.na(td$Feature))
}
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
if (!text_has_feature_names) {
if (!is.null(feature_names) && !is_stump()) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]]
}
}

# parse leaf lines

@ -303,7 +303,11 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
if (is.character(features) && is.null(colnames(data)))
stop("either provide `data` with column names or provide `features` as column indices")

if (is.null(model$feature_names) && model$nfeatures != ncol(data))
model_feature_names <- NULL
if (is.null(features) && !is.null(model)) {
model_feature_names <- xgb.feature_names(model)
}
if (is.null(model_feature_names) && xgb.num_feature(model) != ncol(data))
stop("if model has no feature_names, columns in `data` must match features in model")

if (!is.null(subsample)) {
@ -332,7 +336,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
}

if (is.null(features)) {
if (!is.null(model$feature_names)) {
if (!is.null(model_feature_names)) {
imp <- xgb.importance(model = model, trees = trees)
} else {
imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))

@ -1,12 +1,24 @@
#' Save xgboost model to binary file
#'
#' Save xgboost model to a file in binary format.
#' Save xgboost model to a file in binary or JSON format.
#'
#' @param model model object of \code{xgb.Booster} class.
#' @param fname name of the file to write.
#' @param model Model object of \code{xgb.Booster} class.
#' @param fname Name of the file to write.
#'
#' Note that the extension of this file name determines the serialization format to use:\itemize{
#' \item Extension ".ubj" will use the universal binary JSON format (recommended).
#' This format uses binary types for e.g. floating point numbers, thereby preventing any loss
#' of precision when converting to a human-readable JSON text or similar.
#' \item Extension ".json" will use plain JSON, which is a human-readable format.
#' \item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
#' not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
#' attribute that boosters might keep, nor feature names or user-specified attributes.
#' \item If the format is not specified by passing one of the file extensions above, will
#' default to UBJ.
#' }
#'
#' @details
#' This methods allows to save a model in an xgboost-internal binary format which is universal
#' This method allows saving a model in an xgboost-internal binary or text format which is universal
#' among the various xgboost interfaces. In R, the saved model file could be read-in later
#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
#' of \code{\link{xgb.train}}.

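A minimal sketch of how the file extension selects the serialization format, per the parameter documentation above (file paths are illustrative):

data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nthread = 2, nrounds = 2, objective = "binary:logistic")
xgb.save(bst, file.path(tempdir(), "model.ubj"))   # universal binary JSON (recommended)
xgb.save(bst, file.path(tempdir(), "model.json"))  # plain-text JSON, human-readable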
@ -14,13 +26,13 @@
#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
#' corresponding R-methods would need to be used to load it. Moreover, persisting the model with
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
#' \code{\link[base]{readRDS}} or \code{\link[base]{save}}) might cause compatibility problems in
#' future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
#' how to persist models in a future-proof way, i.e. to make the model accessible in future
#' releases of XGBoost.
#'
#' @seealso
#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
#' \code{\link{xgb.load}}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -51,8 +63,7 @@ xgb.save <- function(model, fname) {
stop("model must be xgb.Booster.",
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
}
model <- xgb.Booster.complete(model, saveraw = FALSE)
fname <- path.expand(fname)
.Call(XGBoosterSaveModel_R, model$handle, enc2utf8(fname[1]))
.Call(XGBoosterSaveModel_R, xgb.get.handle(model), enc2utf8(fname[1]))
return(TRUE)
}

@ -11,8 +11,6 @@
#' \item \code{deprecated}: Encode the booster into old customized binary format.
#' }
#'
#' Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@ -30,7 +28,7 @@
#' bst <- xgb.load.raw(raw)
#'
#' @export
xgb.save.raw <- function(model, raw_format = "deprecated") {
xgb.save.raw <- function(model, raw_format = "ubj") {
handle <- xgb.get.handle(model)
args <- list(format = raw_format)
.Call(XGBoosterSaveModelToRaw_R, handle, jsonlite::toJSON(args, auto_unbox = TRUE))

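With the default switched to "ubj" above, a short sketch of round-tripping through raw vectors (assuming a fitted booster `bst`):

raw_ubj <- xgb.save.raw(bst)                      # "ubj" is now the default format
raw_json <- xgb.save.raw(bst, raw_format = "json")
bst2 <- xgb.load.raw(raw_ubj)                     # load back from the raw vector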
@ -1,21 +0,0 @@
#' Serialize the booster instance into R's raw vector. The serialization method differs
#' from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
#' parameters. This serialization format is not stable across different xgboost versions.
#'
#' @param booster the booster instance
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' raw <- xgb.serialize(bst)
#' bst <- xgb.unserialize(raw)
#'
#' @export
xgb.serialize <- function(booster) {
handle <- xgb.get.handle(booster)
.Call(XGBoosterSerializeToBuffer_R, handle)
}
@ -152,6 +152,10 @@
#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
#' parameters' values. User can provide either existing or their own callback methods in order
#' to customize the training process.
#'
#' Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
#' are kept as R attributes, and thus do not get saved when using non-R serializers like
#' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
#' @param ... other parameters to pass to \code{params}.
#' @param label vector of response values. Should not be provided when data is
#' a local data file name or an \code{xgb.DMatrix}.
@ -160,6 +164,9 @@
#' This parameter is only used when input is a dense matrix.
#' @param weight a vector indicating the weight for each row of the input.
#'
#' @return
#' An object of class \code{xgb.Booster}.
#'
#' @details
#' These are the training functions for \code{xgboost}.
#'
@ -201,28 +208,20 @@
#' \item \code{cb.save.model}: when \code{save_period > 0} is set.
#' }
#'
#' @return
#' An object of class \code{xgb.Booster} with the following elements:
#' \itemize{
#' \item \code{handle} a handle (pointer) to the xgboost model in memory.
#' \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
#' \item \code{niter} number of boosting iterations.
#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
#' first column corresponding to iteration number and the rest corresponding to evaluation
#' metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
#' \item \code{call} a function call.
#' \item \code{params} parameters that were passed to the xgboost library. Note that it does not
#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
#' \item \code{callbacks} callback functions that were either automatically assigned or
#' explicitly passed.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_score} the best evaluation metric value during early stopping.
#' (only available with early stopping).
#' \item \code{feature_names} names of the training dataset features
#' (only when column names were defined in training data).
#' \item \code{nfeatures} number of features in training data.
#' }
#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
#' from typical R objects (it's an 'altrep' list class), and it makes a separation between
#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
#' and shared between interfaces through serialization functions like \link{xgb.save}; and
#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
#' not anyhow used by functions like \link{predict.xgb.Booster}.
#'
#' Be aware that one such R attribute that is automatically added is `params` - this attribute
#' is assigned from the `params` argument to this function, and is only meant to serve as a
#' reference for what went into the booster, but is not used in other methods that take a booster
#' object - so for example, changing the booster's configuration requires calling `xgb.config<-`
#' or 'xgb.parameters<-', while simply modifying `attributes(model)$params$<...>` will have no
#' effect elsewhere.
#'
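A minimal sketch of the split described above between C-level booster attributes and R attributes (assuming a fitted booster `bst`):

xgb.attr(bst, "my_attr") <- "abc"   # C-level attribute: survives xgb.save / xgb.save.raw
attr(bst, "my_r_attr") <- "xyz"     # R attribute: survives only R serializers like saveRDS
attributes(bst)$params              # informational copy of 'params', not used by other methods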
#' @seealso
#' \code{\link{callbacks}},
@ -371,27 +370,31 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# The tree updating process would need slightly different handling
is_update <- NVL(params[['process_type']], '.') == 'update'

past_evaluation_log <- NULL
if (inherits(xgb_model, "xgb.Booster")) {
past_evaluation_log <- attributes(xgb_model)$evaluation_log
}

# Construct a booster (either a new one or load from xgb_model)
handle <- xgb.Booster.handle(
bst <- xgb.Booster(
params = params,
cachelist = append(watchlist, dtrain),
modelfile = xgb_model,
handle = NULL
modelfile = xgb_model
)
niter_init <- bst$niter
bst <- bst$bst
.Call(
XGBoosterCopyInfoFromDMatrix_R,
xgb.get.handle(bst),
dtrain
)
bst <- xgb.handleToBooster(handle = handle, raw = NULL)

# extract parameters that can affect the relationship b/w #trees and #iterations
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
# Note: it might look like these aren't used, but they need to be defined in this
# environment for the callbacks to work correctly.
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint

# When the 'xgb_model' was set, find out how many boosting iterations it has
niter_init <- 0
if (!is.null(xgb_model)) {
niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
if (length(niter_init) == 0) {
niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
}
}
if (is_update && nrounds > niter_init)
stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

@ -405,7 +408,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
for (f in cb$pre_iter) f()

xgb.iter.update(
booster_handle = bst$handle,
bst = bst,
dtrain = dtrain,
iter = iteration - 1,
obj = obj
@ -413,46 +416,43 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),

if (length(watchlist) > 0) {
bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter
booster_handle = bst$handle,
bst = bst,
watchlist = watchlist,
iter = iteration - 1,
feval = feval
)
}

xgb.attr(bst$handle, 'niter') <- iteration - 1

for (f in cb$post_iter) f()

if (stop_condition) break
}
for (f in cb$finalize) f(finalize = TRUE)

bst <- xgb.Booster.complete(bst, saveraw = TRUE)

# store the total number of boosting iterations
bst$niter <- end_iteration

# store the evaluation results
if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) {
keep_evaluation_log <- FALSE
if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) {
keep_evaluation_log <- TRUE
# include the previous compatible history when available
if (inherits(xgb_model, 'xgb.Booster') &&
!is_update &&
!is.null(xgb_model$evaluation_log) &&
!is.null(past_evaluation_log) &&
isTRUE(all.equal(colnames(evaluation_log),
colnames(xgb_model$evaluation_log)))) {
evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
colnames(past_evaluation_log)))) {
evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log))
}
bst$evaluation_log <- evaluation_log
}

bst$call <- match.call()
bst$params <- params
bst$callbacks <- callbacks
if (!is.null(colnames(dtrain)))
bst$feature_names <- colnames(dtrain)
bst$nfeatures <- ncol(dtrain)
extra_attrs <- list(
call = match.call(),
params = params,
callbacks = callbacks
)
if (keep_evaluation_log) {
extra_attrs$evaluation_log <- evaluation_log
}
curr_attrs <- attributes(bst)
attributes(bst) <- c(curr_attrs, extra_attrs)

return(bst)
}

@ -1,41 +0,0 @@
#' Load the instance back from \code{\link{xgb.serialize}}
#'
#' @param buffer the buffer containing booster instance saved by \code{\link{xgb.serialize}}
#' @param handle An \code{xgb.Booster.handle} object which will be overwritten with
#' the new deserialized object. Must be a null handle (e.g. when loading the model through
#' `readRDS`). If not provided, a new handle will be created.
#' @return An \code{xgb.Booster.handle} object.
#'
#' @export
xgb.unserialize <- function(buffer, handle = NULL) {
cachelist <- list()
if (is.null(handle)) {
handle <- .Call(XGBoosterCreate_R, cachelist)
} else {
if (!is.null.handle(handle))
stop("'handle' is not null/empty. Cannot overwrite existing handle.")
.Call(XGBoosterCreateInEmptyObj_R, cachelist, handle)
}
tryCatch(
.Call(XGBoosterUnserializeFromBuffer_R, handle, buffer),
error = function(e) {
error_msg <- conditionMessage(e)
m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)",
error_msg, perl = TRUE)
groups <- regmatches(error_msg, m)[[1]]
if (length(groups) == 3) {
warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ",
"loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ",
"function, to ensure that your model can be read in current and upcoming ",
"XGBoost releases. Please use xgb.save() instead to preserve models for the ",
"long term. For more details and explanation, see ",
"https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html",
sep = ""))
.Call(XGBoosterLoadModelFromRaw_R, handle, buffer)
} else {
stop(e)
}
})
class(handle) <- "xgb.Booster.handle"
return(handle)
}
@ -100,8 +100,10 @@ NULL
#' @importFrom jsonlite toJSON
#' @importFrom methods new
#' @importFrom utils object.size str tail
#' @importFrom stats coef
#' @importFrom stats predict
#' @importFrom stats median
#' @importFrom stats variable.names
#' @importFrom utils head
#' @importFrom graphics barplot
#' @importFrom graphics lines

@ -1,5 +1,4 @@
basic_walkthrough Basic feature walkthrough
caret_wrapper Use xgboost to train in caret library
custom_objective Customize loss function, and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees

@ -1,7 +1,6 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Train a xgboost model from caret library](caret_wrapper.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)

@ -1,44 +0,0 @@
# install development version of caret library that contains xgboost models
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)

# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package
# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent
# and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE)

# Let's add some new categorical features to see if it helps.
# Of course these feature are highly correlated to the Age feature.
# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features,
# even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]

# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old.
# I choose this value based on nothing.
# We will see later if simplifying the information based on arbitrary values is a good strategy
# (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]

#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# train a xgbTree model using caret::train
model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)

# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
# using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)

# See model results
print(model)
@ -27,7 +27,7 @@ head(pred_with_leaf)
create.new.tree.features <- function(model, original.features) {
pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
cols <- list()
for (i in 1:model$niter) {
for (i in 1:xgb.get.num.boosted.rounds(model)) {
# max is not the real max but it s not important for the purpose of adding features
leaf.id <- sort(unique(pred_with_leaf[, i]))
cols[[i]] <- factor(x = pred_with_leaf[, i], level = leaf.id)

@ -9,6 +9,5 @@ demo(create_sparse_matrix, package = 'xgboost')
demo(predict_leaf_indices, package = 'xgboost')
demo(early_stopping, package = 'xgboost')
demo(poisson_regression, package = 'xgboost')
demo(caret_wrapper, package = 'xgboost')
demo(tweedie_regression, package = 'xgboost')
#demo(gpu_accelerated, package = 'xgboost') # can only run when built with GPU support

@ -2,16 +2,44 @@
% Please edit documentation in R/utils.R
\name{a-compatibility-note-for-saveRDS-save}
\alias{a-compatibility-note-for-saveRDS-save}
\title{Do not use \code{\link[base]{saveRDS}} or \code{\link[base]{save}} for long-term archival of
models. Instead, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}}.}
\title{Model Serialization and Compatibility}
\description{
It is a common practice to use the built-in \code{\link[base]{saveRDS}} function (or
\code{\link[base]{save}}) to persist R objects to the disk. While it is possible to persist
\code{xgb.Booster} objects using \code{\link[base]{saveRDS}}, it is not advisable to do so if
the model is to be accessed in the future. If you train a model with the current version of
XGBoost and persist it with \code{\link[base]{saveRDS}}, the model is not guaranteed to be
accessible in later releases of XGBoost. To ensure that your model can be accessed in future
releases of XGBoost, use \code{\link{xgb.save}} or \code{\link{xgb.save.raw}} instead.
When it comes to serializing XGBoost models, it's possible to use R serializers such as
\link{save} or \link{saveRDS} to serialize an XGBoost R model, but XGBoost also provides
its own serializers with better compatibility guarantees, which allow loading
said models in other language bindings of XGBoost.

Note that an \code{xgb.Booster} object, outside of its core components, might also keep:\itemize{
\item Additional model configuration (accessible through \link{xgb.config}),
which includes model fitting parameters like \code{max_depth} and runtime parameters like \code{nthread}.
These are not necessarily useful for prediction/importance/plotting.
\item Additional R-specific attributes - e.g. results of callbacks, such as evaluation logs,
which are kept as a \code{data.table} object, accessible through \code{attributes(model)$evaluation_log}
if present.
}

The first one (configurations) does not have the same compatibility guarantees as
the model itself, including attributes that are set and accessed through \link{xgb.attributes} - that is, such configuration
might be lost after loading the booster in a different XGBoost version, regardless of the
serializer that was used. These are saved when using \link{saveRDS}, but will be discarded
if loaded into an incompatible XGBoost version. They are not saved when using XGBoost's
serializers from its public interface including \link{xgb.save} and \link{xgb.save.raw}.

The second ones (R attributes) are not part of the standard XGBoost model structure, and thus are
not saved when using XGBoost's own serializers. These attributes are only used for informational
purposes, such as keeping track of evaluation metrics as the model was fit, or saving the R
call that produced the model, but are otherwise not used for prediction / importance / plotting / etc.
These R attributes are only preserved when using R's serializers.

Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and XGBoost models
before version \verb{2.1.0}, have a very different R object structure and are incompatible with
each other. Hence, models that were saved with R serializers like \code{saveRDS} or \code{save} before
version \verb{2.1.0} will not work with later \code{xgboost} versions and vice versa. Be aware that
the structure of R model objects could in theory change again in the future, so XGBoost's serializers
should be preferred for long-term storage.

Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or
higher of said package, and will have the same compatibility restrictions as R serializers.
}
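A minimal sketch of the trade-off described above (file paths illustrative; `bst` a fitted booster):

xgb.save(bst, file.path(tempdir(), "model.ubj"))  # portable across XGBoost versions and bindings
saveRDS(bst, file.path(tempdir(), "model.rds"))   # keeps R attributes, but R-only and version-sensitive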
\details{
Use \code{\link{xgb.save}} to save the XGBoost model as a stand-alone file. You may opt into
@ -24,9 +52,10 @@ re-construct the corresponding model. To read the model back, use \code{\link{xg
The \code{\link{xgb.save.raw}} function is useful if you'd like to persist the XGBoost model
as part of another R object.

Note: Do not use \code{\link{xgb.serialize}} to store models long-term. It persists not only the
model but also internal configurations and parameters, and its format is not stable across
multiple XGBoost versions. Use \code{\link{xgb.serialize}} only for checkpointing.
Use \link{saveRDS} if you require the R-specific attributes that a booster might have, such
as evaluation logs, but note that future compatibility of such objects is outside XGBoost's
control as it relies on R's serialization format (see e.g. the details section in
\link{serialize} and \link{save} from base R).

For more details and explanation about model persistence and archival, consult the page
\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}.

@ -4,17 +4,22 @@
\alias{cb.save.model}
\title{Callback closure for saving a model file.}
\usage{
cb.save.model(save_period = 0, save_name = "xgboost.model")
cb.save.model(save_period = 0, save_name = "xgboost.ubj")
}
\arguments{
\item{save_period}{save the model to disk after every
\code{save_period} iterations; 0 means save the model at the end.}

\item{save_name}{the name or path for the saved model file.
It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.model',
the file saved at iteration 50 would be named "xgboost_0050.model".}

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that the format of the model being saved is determined by the file
extension specified here (see \link{xgb.save} for details about how it works).

It can contain a \code{\link[base]{sprintf}} formatting specifier
to include the integer iteration number in the file name.
E.g., with \code{save_name} = 'xgboost_\%04d.ubj',
the file saved at iteration 50 would be named "xgboost_0050.ubj".
}\if{html}{\out{</div>}}}
}
\description{
Callback closure for saving a model file.
@ -29,5 +34,7 @@ Callback function expects the following values to be set in its calling frame:
\code{end_iteration}.
}
\seealso{
\link{xgb.save}

\code{\link{callbacks}}
}

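A short sketch of the callback with the sprintf-style name described above (assuming `params` and `dtrain` as in the other examples; the path is illustrative):

bst <- xgb.train(params = params, data = dtrain, nrounds = 10,
                 callbacks = list(cb.save.model(save_period = 5,
                                                save_name = file.path(tempdir(), "xgboost_%04d.ubj"))))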
R-package/man/coef.xgb.Booster.Rd
@ -0,0 +1,50 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{coef.xgb.Booster}
\alias{coef.xgb.Booster}
\title{Extract coefficients from linear booster}
\usage{
\method{coef}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{A fitted booster of 'gblinear' type.}

\item{...}{Not used.}
}
\value{
The extracted coefficients:\itemize{
\item If there's only one coefficient per column in the data, will be returned as a
vector, potentially containing the feature names if available, with the intercept
as first column.
\item If there's more than one coefficient per column in the data (e.g. when using
\code{objective="multi:softmax"}), will be returned as a matrix with dimensions equal
to \verb{[num_features, num_cols]}, with the intercepts as first row. Note that the column
(classes in multi-class classification) dimension will not be named.
}

The intercept returned here will include the 'base_score' parameter (unlike the 'bias'
or the last coefficient in the model dump, which doesn't have 'base_score' added to it),
hence one should get the same values from calling \code{predict(..., outputmargin = TRUE)} and
from performing a matrix multiplication with \code{model.matrix(~., ...)}.

Be aware that the coefficients are obtained by first converting them to strings and
back, so there will always be some very small loss of precision compared to the actual
coefficients as used by \link{predict.xgb.Booster}.
}
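A sketch of the predict/matrix-multiplication equivalence mentioned above, under the mtcars setup from the examples below (assumes the feature order of x matches model.matrix output):

# after fitting 'model' on x <- as.matrix(mtcars[, -1]) with booster = "gblinear"
margins <- predict(model, x, outputmargin = TRUE)
manual <- model.matrix(~., data = mtcars[, -1]) %*% coef(model)
max(abs(margins - as.numeric(manual)))  # near zero, up to string-conversion precision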
\description{
Extracts the coefficients from a 'gblinear' booster object,
as produced by \code{xgb.train} when using parameter \code{booster="gblinear"}.

Note: this function will error out if passing a booster model
which is not of "gblinear" type.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars[, 1]
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(data = x, label = y, nthread = 1)
params <- list(booster = "gblinear", nthread = 1)
model <- xgb.train(data = dm, params = params, nrounds = 2)
coef(model)
}
@ -1,24 +1,42 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{getinfo}
% Please edit documentation in R/xgb.Booster.R, R/xgb.DMatrix.R
\name{getinfo.xgb.Booster}
\alias{getinfo.xgb.Booster}
\alias{setinfo.xgb.Booster}
\alias{getinfo}
\alias{getinfo.xgb.DMatrix}
\title{Get information of an xgb.DMatrix object}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Get or set information of xgb.DMatrix and xgb.Booster objects}
\usage{
\method{getinfo}{xgb.Booster}(object, name)

\method{setinfo}{xgb.Booster}(object, name, info)

getinfo(object, name)

\method{getinfo}{xgb.DMatrix}(object, name)

setinfo(object, name, info)

\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix}}
\item{object}{Object of class \code{xgb.DMatrix} or \code{xgb.Booster}.}

\item{name}{the name of the information field to get (see details)}

\item{info}{the specific field of information to set}
}
\value{
For \code{getinfo}, will return the requested field. For \code{setinfo}, will always return value \code{TRUE}
if it succeeds.
}
\description{
Get information of an xgb.DMatrix object
Get or set information of xgb.DMatrix and xgb.Booster objects
}
\details{
The \code{name} field can be one of the following:
The \code{name} field can be one of the following for \code{xgb.DMatrix}:

\itemize{
\item \code{label}
@ -33,8 +51,28 @@ The \code{name} field can be one of the following:
}
See the documentation for \link{xgb.DMatrix} for more information about these fields.

For \code{xgb.Booster}, can be one of the following:
\itemize{
\item \code{feature_type}
\item \code{feature_name}
}

Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
for a DMatrix that had 'qid' assigned.

\bold{Important}: when calling \code{setinfo}, the objects are modified in-place. See
\link{xgb.copy.Booster} for an idea of how this in-place assignment works.

See the documentation for \link{xgb.DMatrix} for possible fields that can be set
(which correspond to arguments in that function).

Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
\item data
\item missing
\item silent
\item nthread
}
}
\examples{
data(agaricus.train, package='xgboost')
@ -45,4 +83,11 @@ setinfo(dtrain, 'label', 1-labels)

labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
}

@ -2,7 +2,6 @@
% Please edit documentation in R/xgb.Booster.R
\name{predict.xgb.Booster}
\alias{predict.xgb.Booster}
\alias{predict.xgb.Booster.handle}
\title{Predict method for XGBoost model}
\usage{
\method{predict}{xgb.Booster}(
@ -21,11 +20,9 @@
strict_shape = FALSE,
...
)

\method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}.}

\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.

@ -4,14 +4,15 @@
\alias{print.xgb.Booster}
\title{Print xgb.Booster}
\usage{
\method{print}{xgb.Booster}(x, verbose = FALSE, ...)
\method{print}{xgb.Booster}(x, ...)
}
\arguments{
\item{x}{An \code{xgb.Booster} object.}

\item{verbose}{Whether to print detailed data (e.g., attribute values).}

\item{...}{Not currently used.}
\item{...}{Not used.}
}
\value{
The same \code{x} object, returned invisibly
}
\description{
Print information about \code{xgb.Booster}.
@ -33,6 +34,5 @@ bst <- xgboost(
attr(bst, "myattr") <- "memo"

print(bst)
print(bst, verbose = TRUE)

}

@ -1,42 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{setinfo}
\alias{setinfo}
\alias{setinfo.xgb.DMatrix}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, name, info)

\method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}

\item{name}{the name of the field to get}

\item{info}{the specific field of information to set}
}
\description{
Set information of an xgb.DMatrix object
}
\details{
See the documentation for \link{xgb.DMatrix} for possible fields that can be set
(which correspond to arguments in that function).

Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
but \bold{aren't} allowed here:\itemize{
\item data
\item missing
\item silent
\item nthread
}
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))

labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all.equal(labels2, 1-labels))
}
R-package/man/variable.names.xgb.Booster.Rd
@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{variable.names.xgb.Booster}
\alias{variable.names.xgb.Booster}
\title{Get Feature Names from Booster}
\usage{
\method{variable.names}{xgb.Booster}(object, ...)
}
\arguments{
\item{object}{An \code{xgb.Booster} object.}

\item{...}{Not used.}
}
\description{
Returns the feature / variable / column names from a fitted
booster object, which are set automatically during the call to \link{xgb.train}
from the DMatrix names, or which can be set manually through \link{setinfo}.

If the object doesn't have feature names, will return \code{NULL}.

It is equivalent to calling \code{getinfo(object, "feature_name")}.
}
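A quick sketch of this accessor (assuming the agaricus data as in the other examples):

data(agaricus.train, package = 'xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
bst <- xgb.train(params = list(nthread = 2), data = dtrain, nrounds = 2)
head(variable.names(bst))   # same as head(getinfo(bst, "feature_name"))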
@ -1,61 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.Booster.complete}
\alias{xgb.Booster.complete}
\title{Restore missing parts of an incomplete xgb.Booster object}
\usage{
xgb.Booster.complete(object, saveraw = TRUE)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}

\item{saveraw}{A flag indicating whether to append \code{raw} Booster memory dump data
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
but it has a raw Booster memory dump).
}
\details{
While this method is primarily for internal use, it might be useful in some practical situations.

E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using
\code{xgb.Booster.complete()} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete()} function explicitly once after loading a model as an R-object.
That would prevent further repeated implicit reconstruction of an internal booster model.
}
\examples{

data(agaricus.train, package = "xgboost")

bst <- xgboost(
data = agaricus.train$data,
label = agaricus.train$label,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
)

fname <- file.path(tempdir(), "xgb_model.Rds")
saveRDS(bst, fname)

# Warning: The resulting RDS file is only compatible with the current XGBoost version.
# Refer to the section titled "a-compatibility-note-for-saveRDS-save".
bst1 <- readRDS(fname)
# the handle is invalid:
print(bst1$handle)

bst1 <- xgb.Booster.complete(bst1)
# now the handle points to a valid internal booster model:
print(bst1$handle)

}
@ -16,7 +16,7 @@ xgb.attributes(object)
xgb.attributes(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}

\item{name}{A non-empty character string specifying which attribute is to be accessed.}

@ -51,15 +51,14 @@ Also, setting an attribute that has the same name as one of xgboost's parameters
change the value of that parameter for a model.
Use \code{\link[=xgb.parameters<-]{xgb.parameters<-()}} to set or change model parameters.

The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied.
That would only matter if attributes need to be set many times.
Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
and it would be the user's responsibility to call \code{\link[=xgb.serialize]{xgb.serialize()}} to update it.

The \verb{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
but it doesn't delete the other existing attributes.

Important: since this modifies the booster's C object, semantics for assignment here
will differ from R's, as any object reference to the same booster will be modified
too, while assignment of R attributes through \verb{attributes(model)$<attr> <- <value>}
will follow the usual copy-on-write R semantics (see \link{xgb.copy.Booster} for an
example of these behaviors).
}
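A brief sketch of the setters described above (assuming a fitted booster `bst`):

xgb.attr(bst, "my_key") <- "my value"          # set a single attribute in-place
xgb.attributes(bst) <- list(a = "1", b = "2")  # add/update several; others are kept
xgb.attr(bst, "a")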
\examples{
data(agaricus.train, package = "xgboost")

@ -10,13 +10,23 @@ xgb.config(object)
xgb.config(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place} when assigning to it.}

\item{value}{A JSON string.}
\item{value}{An R list.}
}
\value{
\code{xgb.config} will return the parameters as an R list.
}
\description{
Accessors for model parameters as JSON string
}
\details{
Note that assignment is performed in-place on the booster C object, which unlike assignment
of R attributes, doesn't follow typical copy-on-write semantics for assignment - i.e. all references
to the same booster will also get updated.

See \link{xgb.copy.Booster} for an example of this behavior.
}
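A minimal sketch of the list-based accessor described above (assuming a fitted booster `bst`):

config <- xgb.config(bst)                    # parameters as an R list
print(config$learner$gradient_booster$name) # e.g. "gbtree" or "gblinear"
xgb.config(bst) <- config                    # assigns back in-place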
\examples{
data(agaricus.train, package = "xgboost")


R-package/man/xgb.copy.Booster.Rd
@ -0,0 +1,53 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.copy.Booster}
\alias{xgb.copy.Booster}
\title{Deep-copies a Booster Object}
\usage{
xgb.copy.Booster(model)
}
\arguments{
\item{model}{An 'xgb.Booster' object.}
}
\value{
A deep copy of \code{model} - it will be identical in every way, but C-level
functions called on that copy will not affect the \code{model} variable.
}
\description{
Creates a deep copy of an 'xgb.Booster' object, such that the
C object pointer contained will be a different object, and hence functions
like \link{xgb.attr} will not affect the object from which it was copied.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- mtcars[, -1]
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(nthread = 1),
nrounds = 3
)

# Set an arbitrary attribute kept at the C level
xgb.attr(model, "my_attr") <- 100
print(xgb.attr(model, "my_attr"))

# Just assigning to a new variable will not create
# a deep copy - C object pointer is shared, and in-place
# modifications will affect both objects
model_shallow_copy <- model
xgb.attr(model_shallow_copy, "my_attr") <- 333
# 'model' was also affected by this change:
print(xgb.attr(model, "my_attr"))

model_deep_copy <- xgb.copy.Booster(model)
xgb.attr(model_deep_copy, "my_attr") <- 444
# 'model' was NOT affected by this change
# (keeps previous value that was assigned before)
print(xgb.attr(model, "my_attr"))

# Verify that the new object was actually modified
print(xgb.attr(model_deep_copy, "my_attr"))
}
@ -8,7 +8,8 @@ xgb.gblinear.history(model, class_index = NULL)
}
\arguments{
\item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained
using the \code{cb.gblinear.history()} callback.}
using the \code{cb.gblinear.history()} callback, but \bold{not} a booster
loaded from \link{xgb.load} or \link{xgb.load.raw}.}

\item{class_index}{zero-based class index to extract the coefficients for only that
specific class in a multinomial multiclass model. When it is NULL, all the
@ -27,3 +28,11 @@ A helper function to extract the matrix of linear coefficients' history
from a gblinear model created while using the \code{cb.gblinear.history()}
callback.
}
\details{
Note that this is an R-specific function that relies on R attributes that
are not saved when using xgboost's own serialization functions like \link{xgb.load}
or \link{xgb.load.raw}.

In order for a serialized model to be accepted by this function, one must use R
serializers such as \link{saveRDS}.
}
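A sketch of that distinction (hypothetical file paths; only the R serializer preserves the R attributes that xgb.gblinear.history() reads):

saveRDS(model, file.path(tempdir(), "model.Rds"))    # R serializer: keeps R attributes
model_r <- readRDS(file.path(tempdir(), "model.Rds"))
coef_path <- xgb.gblinear.history(model_r)           # works

xgb.save(model, file.path(tempdir(), "model.ubj"))   # xgboost serializer: drops R attributes
model_x <- xgb.load(file.path(tempdir(), "model.ubj"))
# xgb.gblinear.history(model_x) would fail here: the history attribute is gone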
R-package/man/xgb.get.num.boosted.rounds.Rd (new file, 22 lines)
@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.get.num.boosted.rounds}
\alias{xgb.get.num.boosted.rounds}
\title{Get number of boosting rounds in a fitted booster}
\usage{
xgb.get.num.boosted.rounds(model)
}
\arguments{
\item{model}{A fitted \code{xgb.Booster} model.}
}
\value{
The number of rounds saved in the model, as an integer.
}
\description{
Get number of boosting rounds in a fitted booster
}
\details{
Note that setting booster parameters related to training
continuation / updates through \link{xgb.parameters<-} will reset the
number of rounds to zero.
}
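A minimal sketch (assuming 'model' was trained for 3 rounds; the reset behavior follows the note in the details section above):

xgb.get.num.boosted.rounds(model)   # 3
xgb.parameters(model) <- list(updater = "refresh", process_type = "update")
xgb.get.num.boosted.rounds(model)   # expected to be reset to zero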
R-package/man/xgb.is.same.Booster.Rd (new file, 59 lines)
@ -0,0 +1,59 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{xgb.is.same.Booster}
\alias{xgb.is.same.Booster}
\title{Check if two boosters share the same C object}
\usage{
xgb.is.same.Booster(obj1, obj2)
}
\arguments{
\item{obj1}{Booster model to compare with \code{obj2}.}

\item{obj2}{Booster model to compare with \code{obj1}.}
}
\value{
Either \code{TRUE} or \code{FALSE} according to whether the two boosters share
the underlying C object.
}
\description{
Checks whether two booster objects refer to the same underlying C object.
}
\details{
As booster objects (as returned by e.g. \link{xgb.train}) contain an R 'externalptr'
object, they don't follow typical copy-on-write semantics of other R objects - that is, if
one assigns a booster to a different variable and modifies that new variable through in-place
methods like \link{xgb.attr<-}, the modification will be applied to both the old and the new
variable, unlike typical R assignments which would only modify the latter.

This function allows checking whether two booster objects share the same 'externalptr',
regardless of the R attributes that they might have.

In order to duplicate a booster in such a way that the copy wouldn't share the same
'externalptr', one can use function \link{xgb.copy.Booster}.
}
\examples{
library(xgboost)
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
model <- xgb.train(
params = list(nthread = 1),
data = xgb.DMatrix(x, label = y, nthread = 1),
nrounds = 3
)

model_shallow_copy <- model
xgb.is.same.Booster(model, model_shallow_copy) # same C object

model_deep_copy <- xgb.copy.Booster(model)
xgb.is.same.Booster(model, model_deep_copy) # different C objects

# In-place assignments modify all references,
# but not full/deep copies of the booster
xgb.attr(model_shallow_copy, "my_attr") <- 111
xgb.attr(model, "my_attr") # gets modified
xgb.attr(model_deep_copy, "my_attr") # doesn't get modified
}
\seealso{
\link{xgb.copy.Booster}
}
@ -48,5 +48,5 @@ xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.save}}
}
@ -4,12 +4,10 @@
\alias{xgb.load.raw}
\title{Load serialised xgboost model from R's raw vector}
\usage{
xgb.load.raw(buffer, as_booster = FALSE)
xgb.load.raw(buffer)
}
\arguments{
\item{buffer}{the buffer returned by xgb.save.raw}

\item{as_booster}{Return the loaded model as xgb.Booster instead of xgb.Booster.handle.}
}
\description{
User can generate raw memory buffer by calling xgb.save.raw
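The corresponding round-trip, sketched (the raw vector stays in R memory; no file is involved):

raw_bytes <- xgb.save.raw(bst)    # raw vector holding the serialized model
bst2 <- xgb.load.raw(raw_bytes)   # with this change, always returns an 'xgb.Booster'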
@ -14,8 +14,11 @@ xgb.model.dt.tree(
)
}
\arguments{
\item{feature_names}{Character vector used to overwrite the feature names
of the model. The default (\code{NULL}) uses the original feature names.}
\item{feature_names}{Character vector of feature names. If the model already
contains feature names, those will be used when \code{feature_names=NULL} (default value).

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that, if the model already contains feature names, it's \\bold\{not\} possible to override them here.
}\if{html}{\out{</div>}}}

\item{model}{Object of class \code{xgb.Booster}.}

@ -76,8 +79,6 @@ bst <- xgboost(
objective = "binary:logistic"
)

(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))

# This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst))
@ -7,17 +7,27 @@
xgb.parameters(object) <- value
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.}
\item{object}{Object of class \code{xgb.Booster}. \bold{Will be modified in-place}.}

\item{value}{A list (or an object coercible to a list) with the names of parameters to set
and the elements corresponding to parameter values.}
}
\value{
The same booster \code{object}, which gets modified in-place.
}
\description{
Only the setter for xgboost parameters is currently implemented.
}
\details{
Note that the setter would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only just a handle would need to be copied.
Just like \link{xgb.attr}, this function will make in-place modifications
on the booster object which do not follow typical R assignment semantics - that is,
all references to the same booster will also be updated, unlike assignment of R
attributes which follow copy-on-write semantics.

See \link{xgb.copy.Booster} for an example of this behavior.

Be aware that setting parameters of a fitted booster related to training continuation / updates
will reset its number of rounds indicator to zero.
}
\examples{
data(agaricus.train, package = "xgboost")
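A usage sketch (hypothetical parameter values; the assignment is in-place, as documented above):

xgb.parameters(bst) <- list(max_depth = 4, eta = 0.2)
# every R variable referencing this same booster now sees the new parameters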
@ -7,15 +7,27 @@
xgb.save(model, fname)
}
\arguments{
\item{model}{model object of \code{xgb.Booster} class.}
\item{model}{Model object of \code{xgb.Booster} class.}

\item{fname}{name of the file to write.}
\item{fname}{Name of the file to write.

Note that the extension of this file name determines the serialization format to use:\itemize{
\item Extension ".ubj" will use the universal binary JSON format (recommended).
This format uses binary types for e.g. floating point numbers, thereby preventing any loss
of precision when converting to a human-readable JSON text or similar.
\item Extension ".json" will use plain JSON, which is a human-readable format.
\item Extension ".deprecated" will use a \bold{deprecated} binary format. This format will
not be able to save attributes introduced after v1 of XGBoost, such as the "best_iteration"
attribute that boosters might keep, nor feature names or user-specified attributes.
\item If the format is not specified by passing one of the file extensions above, it will
default to UBJ.
}}
}
\description{
Save xgboost model to a file in binary format.
Save xgboost model to a file in binary or JSON format.
}
\details{
This methods allows to save a model in an xgboost-internal binary format which is universal
This method allows saving a model in an xgboost-internal binary or text format which is universal
among the various xgboost interfaces. In R, the saved model file could be read-in later
using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
of \code{\link{xgb.train}}.
@ -23,7 +35,7 @@ of \code{\link{xgb.train}}.
Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
or \code{\link[base]{save}}). However, it would then only be compatible with R, and
corresponding R-methods would need to be used to load it. Moreover, persisting the model with
\code{\link[base]{readRDS}} or \code{\link[base]{save}}) will cause compatibility problems in
\code{\link[base]{readRDS}} or \code{\link[base]{save}}) might cause compatibility problems in
future versions of XGBoost. Consult \code{\link{a-compatibility-note-for-saveRDS-save}} to learn
how to persist models in a future-proof way, i.e. to make the model accessible in future
releases of XGBoost.
@ -51,5 +63,5 @@ xgb.save(bst, fname)
bst <- xgb.load(fname)
}
\seealso{
\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
\code{\link{xgb.load}}
}
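A sketch of the extension-driven formats (hypothetical file paths under tempdir()):

xgb.save(bst, file.path(tempdir(), "model.ubj"))    # universal binary JSON (recommended)
xgb.save(bst, file.path(tempdir(), "model.json"))   # plain, human-readable JSON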
@ -5,7 +5,7 @@
\title{Save xgboost model to R's raw vector,
user can call xgb.load.raw to load the model back from raw vector}
\usage{
xgb.save.raw(model, raw_format = "deprecated")
xgb.save.raw(model, raw_format = "ubj")
}
\arguments{
\item{model}{the model object.}
@ -15,9 +15,7 @@ xgb.save.raw(model, raw_format = "deprecated")
\item \code{json}: Encode the booster into JSON text document.
\item \code{ubj}: Encode the booster into Universal Binary JSON.
\item \code{deprecated}: Encode the booster into old customized binary format.
}

Right now the default is \code{deprecated} but will be changed to \code{ubj} in upcoming release.}
}}
}
\description{
Save xgboost model from xgboost or xgb.train
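A sketch of the three encodings (with the new default being "ubj"):

raw_ubj  <- xgb.save.raw(bst)                             # Universal Binary JSON
raw_json <- xgb.save.raw(bst, raw_format = "json")        # JSON text
raw_old  <- xgb.save.raw(bst, raw_format = "deprecated")  # legacy binary format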
@ -1,29 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.serialize.R
\name{xgb.serialize}
\alias{xgb.serialize}
\title{Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.}
\usage{
xgb.serialize(booster)
}
\arguments{
\item{booster}{the booster instance}
}
\description{
Serialize the booster instance into R's raw vector. The serialization method differs
from \code{\link{xgb.save.raw}} as the latter one saves only the model but not
parameters. This serialization format is not stable across different xgboost versions.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
raw <- xgb.serialize(bst)
bst <- xgb.unserialize(raw)

}
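With xgb.serialize() removed, the same needs are covered by the remaining interfaces - a sketch of the replacement workflow (hypothetical 'fname'; assuming, per this PR, that R's own serializers can now handle the booster directly):

raw <- xgb.save.raw(bst)    # cross-interface raw buffer, replacing xgb.serialize(bst)
bst <- xgb.load.raw(raw)    # replacing xgb.unserialize(raw)
saveRDS(bst, fname)         # or rely on R serializers directly: the C object
bst <- readRDS(fname)       # is now serialized on demand, no handle repair needed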
@ -205,7 +205,12 @@ file with a previously saved model.}
\item{callbacks}{a list of callback functions to perform various tasks during boosting.
See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
parameters' values. User can provide either existing or their own callback methods in order
to customize the training process.}
to customize the training process.

\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
are kept as R attributes, and thus do not get saved when using non-R serializers like
\link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
}\if{html}{\out{</div>}}}

\item{...}{other parameters to pass to \code{params}.}

@ -219,27 +224,7 @@ This parameter is only used when input is a dense matrix.}
\item{weight}{a vector indicating the weight for each row of the input.}
}
\value{
An object of class \code{xgb.Booster} with the following elements:
\itemize{
\item \code{handle} a handle (pointer) to the xgboost model in memory.
\item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
\item \code{niter} number of boosting iterations.
\item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
first column corresponding to iteration number and the rest corresponding to evaluation
metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
\item \code{call} a function call.
\item \code{params} parameters that were passed to the xgboost library. Note that it does not
capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
\item \code{callbacks} callback functions that were either automatically assigned or
explicitly passed.
\item \code{best_iteration} iteration number with the best evaluation metric value
(only available with early stopping).
\item \code{best_score} the best evaluation metric value during early stopping.
(only available with early stopping).
\item \code{feature_names} names of the training dataset features
(only when column names were defined in training data).
\item \code{nfeatures} number of features in training data.
}
An object of class \code{xgb.Booster}.
}
\description{
\code{xgb.train} is an advanced interface for training an xgboost model.
@ -285,6 +270,21 @@ and the \code{print_every_n} parameter is passed to it.
\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
\item \code{cb.save.model}: when \code{save_period > 0} is set.
}

Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently
from typical R objects (it's an 'altrep' list class), and it makes a separation between
internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
and shared between interfaces through serialization functions like \link{xgb.save}; and
R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
not used in any way by functions like \link{predict.xgb.Booster}.

Be aware that one such R attribute that is automatically added is \code{params} - this attribute
is assigned from the \code{params} argument to this function, and is only meant to serve as a
reference for what went into the booster, but is not used in other methods that take a booster
object - so for example, changing the booster's configuration requires calling \verb{xgb.config<-}
or \verb{xgb.parameters<-}, while simply modifying \verb{attributes(model)$params$<...>} will have no
effect elsewhere.
}
\examples{
data(agaricus.train, package='xgboost')
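A sketch contrasting the two attribute systems on a fitted booster 'bst' (hypothetical attribute names):

xgb.attr(bst, "note") <- "kept by xgb.save"    # C-level attribute, set in-place
attr(bst, "note2") <- "kept only by saveRDS"   # plain R attribute, copy-on-write
xgb.attr(bst, "note")     # travels with the model across interfaces
attributes(bst)$note2     # R-only; dropped by xgb.save()/xgb.save.raw()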
@ -1,21 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.unserialize.R
\name{xgb.unserialize}
\alias{xgb.unserialize}
\title{Load the instance back from \code{\link{xgb.serialize}}}
\usage{
xgb.unserialize(buffer, handle = NULL)
}
\arguments{
\item{buffer}{the buffer containing booster instance saved by \code{\link{xgb.serialize}}}

\item{handle}{An \code{xgb.Booster.handle} object which will be overwritten with
the new deserialized object. Must be a null handle (e.g. when loading the model through
\code{readRDS}). If not provided, a new handle will be created.}
}
\value{
An \code{xgb.Booster.handle} object.
}
\description{
Load the instance back from \code{\link{xgb.serialize}}
}
@ -15,9 +15,16 @@ Check these declarations against the C/Fortran source code.
*/

/* .Call calls */
extern void XGBInitializeAltrepClass_R(DllInfo *info);
extern SEXP XGDuplicate_R(SEXP);
extern SEXP XGPointerEqComparison_R(SEXP, SEXP);
extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterCreate_R(SEXP);
extern SEXP XGBoosterCreateInEmptyObj_R(SEXP, SEXP);
extern SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP, SEXP);
extern SEXP XGBoosterSetStrFeatureInfo_R(SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGBoosterBoostedRounds_R(SEXP);
extern SEXP XGBoosterGetNumFeature_R(SEXP);
extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterEvalOneIter_R(SEXP, SEXP, SEXP, SEXP);
extern SEXP XGBoosterGetAttrNames_R(SEXP);
@ -57,9 +64,15 @@ extern SEXP XGBGetGlobalConfig_R(void);
extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);

static const R_CallMethodDef CallEntries[] = {
{"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1},
{"XGPointerEqComparison_R", (DL_FUNC) &XGPointerEqComparison_R, 2},
{"XGBoosterTrainOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5},
{"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1},
{"XGBoosterCreateInEmptyObj_R", (DL_FUNC) &XGBoosterCreateInEmptyObj_R, 2},
{"XGBoosterCopyInfoFromDMatrix_R", (DL_FUNC) &XGBoosterCopyInfoFromDMatrix_R, 2},
{"XGBoosterSetStrFeatureInfo_R",(DL_FUNC) &XGBoosterSetStrFeatureInfo_R,3}, // NOLINT
{"XGBoosterGetStrFeatureInfo_R",(DL_FUNC) &XGBoosterGetStrFeatureInfo_R,2}, // NOLINT
{"XGBoosterBoostedRounds_R", (DL_FUNC) &XGBoosterBoostedRounds_R, 1},
{"XGBoosterGetNumFeature_R", (DL_FUNC) &XGBoosterGetNumFeature_R, 1},
{"XGBoosterDumpModel_R", (DL_FUNC) &XGBoosterDumpModel_R, 4},
{"XGBoosterEvalOneIter_R", (DL_FUNC) &XGBoosterEvalOneIter_R, 4},
{"XGBoosterGetAttrNames_R", (DL_FUNC) &XGBoosterGetAttrNames_R, 1},
@ -106,4 +119,5 @@ __declspec(dllexport)
void attribute_visible R_init_xgboost(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
XGBInitializeAltrepClass_R(dll);
}
@ -260,16 +260,18 @@ char cpp_ex_msg[512];
using dmlc::BeginPtr;

XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
return Rf_ScalarLogical(R_ExternalPtrAddr(handle) == nullptr);
}

XGB_DLL void _DMatrixFinalizer(SEXP ext) {
namespace {
void _DMatrixFinalizer(SEXP ext) {
R_API_BEGIN();
if (R_ExternalPtrAddr(ext) == NULL) return;
CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext)));
R_ClearExternalPtr(ext);
R_API_END();
}
} /* namespace */

XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str) {
R_API_BEGIN();
@ -527,8 +529,14 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) {
}

SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len));
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
if (TYPEOF(array) == STRSXP) {
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, STRING_ELT(array, i));
}
} else {
for (size_t i = 0; i < len; ++i) {
SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
}
}

SEXP field_ = PROTECT(Rf_asChar(field));
@ -614,6 +622,14 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
return ScalarInteger(static_cast<int>(ncol));
}

XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
return Rf_duplicate(obj);
}

XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2) {
return Rf_ScalarLogical(R_ExternalPtrAddr(obj1) == R_ExternalPtrAddr(obj2));
}

XGB_DLL SEXP XGDMatrixGetQuantileCut_R(SEXP handle) {
const char *out_names[] = {"indptr", "data", ""};
SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
@ -682,14 +698,134 @@ XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle) {
}

// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(ext)));
R_ClearExternalPtr(ext);
namespace {
void _BoosterFinalizer(SEXP R_ptr) {
if (R_ExternalPtrAddr(R_ptr) == NULL) return;
CHECK_CALL(XGBoosterFree(R_ExternalPtrAddr(R_ptr)));
R_ClearExternalPtr(R_ptr);
}

/* Booster is represented as an altrep list with one element which
corresponds to an 'externalptr' holding the C object, forbidding
modification by not implementing setters, and adding custom serialization. */
R_altrep_class_t XGBAltrepPointerClass;

R_xlen_t XGBAltrepPointerLength_R(SEXP R_altrepped_obj) {
return 1;
}

SEXP XGBAltrepPointerGetElt_R(SEXP R_altrepped_obj, R_xlen_t idx) {
return R_altrep_data1(R_altrepped_obj);
}

SEXP XGBMakeEmptyAltrep() {
SEXP class_name = Rf_protect(Rf_mkString("xgb.Booster"));
SEXP elt_names = Rf_protect(Rf_mkString("ptr"));
SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP R_altrepped_obj = Rf_protect(R_new_altrep(XGBAltrepPointerClass, R_ptr, R_NilValue));
Rf_setAttrib(R_altrepped_obj, R_NamesSymbol, elt_names);
Rf_setAttrib(R_altrepped_obj, R_ClassSymbol, class_name);
Rf_unprotect(4);
return R_altrepped_obj;
}

/* Note: the idea for separating this function from the one above is to be
able to trigger all R allocations first before doing non-R allocations. */
void XGBAltrepSetPointer(SEXP R_altrepped_obj, BoosterHandle handle) {
SEXP R_ptr = R_altrep_data1(R_altrepped_obj);
R_SetExternalPtrAddr(R_ptr, handle);
R_RegisterCFinalizerEx(R_ptr, _BoosterFinalizer, TRUE);
}

SEXP XGBAltrepSerializer_R(SEXP R_altrepped_obj) {
R_API_BEGIN();
BoosterHandle handle = R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj));
char const *serialized_bytes;
bst_ulong serialized_length;
CHECK_CALL(XGBoosterSerializeToBuffer(
handle, &serialized_length, &serialized_bytes));
SEXP R_state = Rf_protect(Rf_allocVector(RAWSXP, serialized_length));
if (serialized_length != 0) {
std::memcpy(RAW(R_state), serialized_bytes, serialized_length);
}
Rf_unprotect(1);
return R_state;
R_API_END();
return R_NilValue; /* <- should not be reached */
}

SEXP XGBAltrepDeserializer_R(SEXP unused, SEXP R_state) {
SEXP R_altrepped_obj = Rf_protect(XGBMakeEmptyAltrep());
R_API_BEGIN();
BoosterHandle handle = nullptr;
CHECK_CALL(XGBoosterCreate(nullptr, 0, &handle));
int res_code = XGBoosterUnserializeFromBuffer(handle,
RAW(R_state),
Rf_xlength(R_state));
if (res_code != 0) {
XGBoosterFree(handle);
}
CHECK_CALL(res_code);
XGBAltrepSetPointer(R_altrepped_obj, handle);
R_API_END();
Rf_unprotect(1);
return R_altrepped_obj;
}

// https://purrple.cat/blog/2018/10/14/altrep-and-cpp/
Rboolean XGBAltrepInspector_R(
SEXP x, int pre, int deep, int pvec,
void (*inspect_subtree)(SEXP, int, int, int)) {
Rprintf("Altrepped external pointer [address:%p]\n",
R_ExternalPtrAddr(R_altrep_data1(x)));
return TRUE;
}

SEXP XGBAltrepDuplicate_R(SEXP R_altrepped_obj, Rboolean deep) {
R_API_BEGIN();
if (!deep) {
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
R_set_altrep_data1(out, R_altrep_data1(R_altrepped_obj));
Rf_unprotect(1);
return out;
} else {
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
char const *serialized_bytes;
bst_ulong serialized_length;
CHECK_CALL(XGBoosterSerializeToBuffer(
R_ExternalPtrAddr(R_altrep_data1(R_altrepped_obj)),
&serialized_length, &serialized_bytes));
BoosterHandle new_handle = nullptr;
CHECK_CALL(XGBoosterCreate(nullptr, 0, &new_handle));
int res_code = XGBoosterUnserializeFromBuffer(new_handle,
serialized_bytes,
serialized_length);
if (res_code != 0) {
XGBoosterFree(new_handle);
}
CHECK_CALL(res_code);
XGBAltrepSetPointer(out, new_handle);
Rf_unprotect(1);
return out;
}
R_API_END();
return R_NilValue; /* <- should not be reached */
}

} /* namespace */

XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll) {
XGBAltrepPointerClass = R_make_altlist_class("XGBAltrepPointerClass", "xgboost", dll);
R_set_altrep_Length_method(XGBAltrepPointerClass, XGBAltrepPointerLength_R);
R_set_altlist_Elt_method(XGBAltrepPointerClass, XGBAltrepPointerGetElt_R);
R_set_altrep_Inspect_method(XGBAltrepPointerClass, XGBAltrepInspector_R);
R_set_altrep_Serialized_state_method(XGBAltrepPointerClass, XGBAltrepSerializer_R);
R_set_altrep_Unserialize_method(XGBAltrepPointerClass, XGBAltrepDeserializer_R);
R_set_altrep_Duplicate_method(XGBAltrepPointerClass, XGBAltrepDuplicate_R);
}
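Seen from R, what this altrep machinery buys (a sketch; 'dm' is a hypothetical training DMatrix): base R serialization now captures the C booster on demand, so no separate raw cache is needed.

bst <- xgb.train(params = list(nthread = 1), data = dm, nrounds = 3)
bst_bytes <- serialize(bst, connection = NULL)   # altrep serializer dumps the C object
bst2 <- unserialize(bst_bytes)                   # deserializer rebuilds a new C object
xgb.is.same.Booster(bst, bst2)                   # FALSE: distinct 'externalptr's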
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP out = Rf_protect(XGBMakeEmptyAltrep());
R_API_BEGIN();
R_xlen_t len = Rf_xlength(dmats);
BoosterHandle handle;
@ -703,33 +839,104 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
XGBAltrepSetPointer(out, handle);
R_API_END();
UNPROTECT(1);
return ret;
Rf_unprotect(1);
return out;
}

XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat) {
R_API_BEGIN();
R_xlen_t len = Rf_xlength(dmats);
BoosterHandle handle;
char const **feature_names;
bst_ulong len_feature_names = 0;
CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
"feature_name",
&len_feature_names,
&feature_names));
if (len_feature_names) {
CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
"feature_name",
feature_names,
len_feature_names));
}

char const **feature_types;
bst_ulong len_feature_types = 0;
CHECK_CALL(XGDMatrixGetStrFeatureInfo(R_ExternalPtrAddr(dmat),
"feature_type",
&len_feature_types,
&feature_types));
if (len_feature_types) {
CHECK_CALL(XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(booster),
"feature_type",
feature_types,
len_feature_types));
}
R_API_END();
return R_NilValue;
}

XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features) {
R_API_BEGIN();
SEXP field_char = Rf_protect(Rf_asChar(field));
bst_ulong len_features = Rf_xlength(features);

int res_code;
{
std::vector<void*> dvec(len);
for (R_xlen_t i = 0; i < len; ++i) {
dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
std::vector<const char*> str_arr(len_features);
for (bst_ulong idx = 0; idx < len_features; idx++) {
str_arr[idx] = CHAR(STRING_ELT(features, idx));
}
res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
res_code = XGBoosterSetStrFeatureInfo(R_ExternalPtrAddr(handle),
CHAR(field_char),
str_arr.data(),
len_features);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(R_handle, handle);
R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE);
Rf_unprotect(1);
R_API_END();
return R_NilValue;
}

XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field) {
R_API_BEGIN();
bst_ulong len;
const char **out_features;
SEXP field_char = Rf_protect(Rf_asChar(field));
CHECK_CALL(XGBoosterGetStrFeatureInfo(R_ExternalPtrAddr(handle),
CHAR(field_char), &len, &out_features));
SEXP out = Rf_protect(Rf_allocVector(STRSXP, len));
for (bst_ulong idx = 0; idx < len; idx++) {
SET_STRING_ELT(out, idx, Rf_mkChar(out_features[idx]));
}
Rf_unprotect(2);
return out;
R_API_END();
return R_NilValue; /* <- should not be reached */
}

XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle) {
SEXP out = Rf_protect(Rf_allocVector(INTSXP, 1));
R_API_BEGIN();
CHECK_CALL(XGBoosterBoostedRounds(R_ExternalPtrAddr(handle), INTEGER(out)));
R_API_END();
Rf_unprotect(1);
return out;
}

/* Note: R's integer class is 32-bit-and-signed only, while xgboost
supports more, so it returns it as a floating point instead */
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle) {
SEXP out = Rf_protect(Rf_allocVector(REALSXP, 1));
R_API_BEGIN();
bst_ulong res;
CHECK_CALL(XGBoosterGetNumFeature(R_ExternalPtrAddr(handle), &res));
REAL(out)[0] = static_cast<double>(res);
R_API_END();
Rf_unprotect(1);
return out;
}

XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
R_API_BEGIN();
SEXP name_ = PROTECT(Rf_asChar(name));
@ -745,8 +952,8 @@ XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
R_API_BEGIN();
CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain)));
Rf_asInteger(iter),
R_ExternalPtrAddr(dtrain)));
R_API_END();
return R_NilValue;
}
@ -8,7 +8,9 @@
#define XGBOOST_R_H_ // NOLINT(*)


#include <R.h>
#include <Rinternals.h>
#include <R_ext/Altrep.h>
#include <R_ext/Random.h>
#include <Rmath.h>

@ -143,6 +145,25 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
*/
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);

/*!
* \brief Call R C-level function 'duplicate'
* \param obj Object to duplicate
*/
XGB_DLL SEXP XGDuplicate_R(SEXP obj);

/*!
* \brief Equality comparison for two pointers
* \param obj1 R 'externalptr'
* \param obj2 R 'externalptr'
*/
XGB_DLL SEXP XGPointerEqComparison_R(SEXP obj1, SEXP obj2);

/*!
* \brief Register the Altrep class used for the booster
* \param dll DLL info as provided by R_init
*/
XGB_DLL void XGBInitializeAltrepClass_R(DllInfo *dll);

/*!
* \brief return the quantile cuts used for the histogram method
* \param handle an instance of data matrix
@ -174,13 +195,37 @@ XGB_DLL SEXP XGDMatrixGetDataAsCSR_R(SEXP handle);
*/
XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats);

/*!
* \brief copy information about features from a DMatrix into a Booster
* \param booster R 'externalptr' pointing to a booster object
* \param dmat R 'externalptr' pointing to a DMatrix object
*/
XGB_DLL SEXP XGBoosterCopyInfoFromDMatrix_R(SEXP booster, SEXP dmat);

/*!
* \brief create xgboost learner, saving the pointer into an existing R object
* \param dmats a list of dmatrix handles that will be cached
* \param R_handle a clean R external pointer (not holding any object)
* \param handle R 'externalptr' holding the booster object
* \param field field name
* \param features features to set for the field
*/
XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle);
XGB_DLL SEXP XGBoosterSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP features);

/*!
* \param handle R 'externalptr' holding the booster object
* \param field field name
*/
XGB_DLL SEXP XGBoosterGetStrFeatureInfo_R(SEXP handle, SEXP field);

/*!
* \brief Get the number of boosted rounds from a model
* \param handle R 'externalptr' holding the booster object
*/
XGB_DLL SEXP XGBoosterBoostedRounds_R(SEXP handle);

/*!
* \brief Get the number of features to which the model was fitted
* \param handle R 'externalptr' holding the booster object
*/
XGB_DLL SEXP XGBoosterGetNumFeature_R(SEXP handle);

/*!
* \brief set parameters
@ -3,7 +3,6 @@
## inconsistent is found.
pkgs <- c(
## CI
"caret",
"pkgbuild",
"roxygen2",
"XML",
@ -25,10 +25,10 @@ test_that("train and predict binary classification", {
"train-error"
)
expect_equal(class(bst), "xgb.Booster")
expect_equal(bst$niter, nrounds)
expect_false(is.null(bst$evaluation_log))
expect_equal(nrow(bst$evaluation_log), nrounds)
expect_lt(bst$evaluation_log[, min(train_error)], 0.03)
expect_equal(xgb.get.num.boosted.rounds(bst), nrounds)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_equal(nrow(attributes(bst)$evaluation_log), nrounds)
expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.03)

pred <- predict(bst, test$data)
expect_length(pred, 1611)
@ -36,7 +36,7 @@ test_that("train and predict binary classification", {
pred1 <- predict(bst, train$data, ntreelimit = 1)
expect_length(pred1, 6513)
err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label)
err_log <- bst$evaluation_log[1, train_error]
err_log <- attributes(bst)$evaluation_log[1, train_error]
expect_lt(abs(err_pred1 - err_log), 10e-6)

pred2 <- predict(bst, train$data, iterationrange = c(1, 2))
@ -160,9 +160,9 @@ test_that("train and predict softprob", {
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))
pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris) * 3)
# row sums add up to total probability of 1:
@ -172,12 +172,12 @@ test_that("train and predict softprob", {
expect_equal(as.numeric(t(mpred)), pred)
pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
# manually calculate error at the 1st iteration:
mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1)
pred_labels <- max.col(mpred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[1, train_merror], err, tolerance = 5e-6)

mpred1 <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, iterationrange = c(1, 2))
expect_equal(mpred, mpred1)
@ -211,14 +211,14 @@ test_that("train and predict softmax", {
),
"train-merror"
)
expect_false(is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$niter * 3, xgb.ntree(bst))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_lt(attributes(bst)$evaluation_log[, min(train_merror)], 0.025)
expect_equal(xgb.get.num.boosted.rounds(bst) * 3, xgb.ntree(bst))

pred <- predict(bst, as.matrix(iris[, -5]))
expect_length(pred, nrow(iris))
err <- sum(pred != lb) / length(lb)
expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[5, train_merror], err, tolerance = 5e-6)
})

test_that("train and predict RF", {
@ -232,12 +232,12 @@ test_that("train and predict RF", {
num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1,
watchlist = list(train = xgb.DMatrix(train$data, label = lb))
)
expect_equal(bst$niter, 1)
expect_equal(xgb.get.num.boosted.rounds(bst), 1)
expect_equal(xgb.ntree(bst), 20)

pred <- predict(bst, train$data)
pred_err <- sum((pred > 0.5) != lb) / length(lb)
expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
expect_lt(abs(attributes(bst)$evaluation_log[1, train_error] - pred_err), 10e-6)
# expect_lt(pred_err, 0.03)

pred <- predict(bst, train$data, ntreelimit = 20)
@ -260,18 +260,18 @@ test_that("train and predict RF with softprob", {
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5,
watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb))
)
expect_equal(bst$niter, 15)
expect_equal(xgb.get.num.boosted.rounds(bst), 15)
expect_equal(xgb.ntree(bst), 15 * 3 * 4)
# predict for all iterations:
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE)
expect_equal(dim(pred), c(nrow(iris), 3))
pred_labels <- max.col(pred) - 1
err <- sum(pred_labels != lb) / length(lb)
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6)
# predict for 7 iterations and adjust for 4 parallel trees per iteration
pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4)
err <- sum((max.col(pred) - 1) != lb) / length(lb)
expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6)
expect_equal(attributes(bst)$evaluation_log[7, train_merror], err, tolerance = 5e-6)
})

test_that("use of multiple eval metrics works", {
@ -284,9 +284,9 @@ test_that("use of multiple eval metrics works", {
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst$evaluation_log))
expect_equal(dim(bst$evaluation_log), c(2, 4))
expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_false(is.null(attributes(bst)$evaluation_log))
expect_equal(dim(attributes(bst)$evaluation_log), c(2, 4))
expect_equal(colnames(attributes(bst)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_output(
bst2 <- xgb.train(
data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
@ -296,9 +296,9 @@ test_that("use of multiple eval metrics works", {
),
"train-error.*train-auc.*train-logloss"
)
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(2, 4))
expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
expect_false(is.null(attributes(bst2)$evaluation_log))
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 4))
expect_equal(colnames(attributes(bst2)$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
})
@ -318,41 +318,25 @@ test_that("training continuation works", {
# continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
expect_false(is.null(attributes(bst2)$evaluation_log))
expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2))
expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log)
# test continuing from raw model data
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_equal(dim(bst2$evaluation_log), c(2, 2))
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
# test continuing from a model in file
fname <- file.path(tempdir(), "xgboost.json")
xgb.save(bst1, fname)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = fname)
if (!windows_flag && !solaris_flag) {
expect_equal(bst$raw, bst2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2))
}
expect_equal(dim(bst2$evaluation_log), c(2, 2))
})

test_that("model serialization works", {
out_path <- file.path(tempdir(), "model_serialization")
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
watchlist <- list(train = dtrain)
param <- list(objective = "binary:logistic", nthread = n_threads)
booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
raw <- xgb.serialize(booster)
saveRDS(raw, out_path)
raw <- readRDS(out_path)

loaded <- xgb.unserialize(raw)
raw_from_loaded <- xgb.serialize(loaded)
expect_equal(raw, raw_from_loaded)
file.remove(out_path)
expect_equal(dim(attributes(bst2)$evaluation_log), c(2, 2))
})

test_that("xgb.cv works", {
@ -455,8 +439,8 @@ test_that("max_delta_step works", {
# model with restricted max_delta_step
bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1)
# the no-restriction model is expected to have consistently lower loss during the initial iterations
expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss))
expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8)
expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss))
expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8)
})

test_that("colsample_bytree works", {
@ -675,3 +659,131 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", {
pred_gr <- predict(model_gr, x)
expect_equal(pred_qid, pred_gr)
})
test_that("Coefficients from gblinear have the expected shape and names", {
|
||||
# Single-column coefficients
|
||||
data(mtcars)
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
mm <- model.matrix(~., data = mtcars[, -1])
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
booster = "gblinear",
|
||||
nthread = 1
|
||||
),
|
||||
nrounds = 3
|
||||
)
|
||||
coefs <- coef(model)
|
||||
expect_equal(length(coefs), ncol(x) + 1)
|
||||
expect_equal(names(coefs), c("(Intercept)", colnames(x)))
|
||||
pred_auto <- predict(model, x)
|
||||
pred_manual <- as.numeric(mm %*% coefs)
|
||||
expect_equal(pred_manual, pred_auto, tolerance = 1e-5)
|
||||
|
||||
# Multi-column coefficients
|
||||
data(iris)
|
||||
y <- as.numeric(iris$Species) - 1
|
||||
x <- as.matrix(iris[, -5])
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
mm <- model.matrix(~., data = iris[, -5])
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(
|
||||
booster = "gblinear",
|
||||
objective = "multi:softprob",
|
||||
num_class = 3,
|
||||
nthread = 1
|
||||
),
|
||||
nrounds = 3
|
||||
)
|
||||
coefs <- coef(model)
|
||||
expect_equal(nrow(coefs), ncol(x) + 1)
|
||||
expect_equal(ncol(coefs), 3)
|
||||
expect_equal(row.names(coefs), c("(Intercept)", colnames(x)))
|
||||
pred_auto <- predict(model, x, outputmargin = TRUE, reshape = TRUE)
|
||||
pred_manual <- unname(mm %*% coefs)
|
||||
expect_equal(pred_manual, pred_auto, tolerance = 1e-7)
|
||||
})
|
||||
|
||||
test_that("Deep copies work as expected", {
|
||||
data(mtcars)
|
||||
y <- mtcars$mpg
|
||||
x <- mtcars[, -1]
|
||||
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
model <- xgb.train(
|
||||
data = dm,
|
||||
params = list(nthread = 1),
|
||||
nrounds = 3
|
||||
)
|
||||
|
||||
xgb.attr(model, "my_attr") <- 100
|
||||
model_shallow_copy <- model
|
||||
xgb.attr(model_shallow_copy, "my_attr") <- 333
|
||||
attr_orig <- xgb.attr(model, "my_attr")
|
||||
attr_shallow <- xgb.attr(model_shallow_copy, "my_attr")
|
||||
expect_equal(attr_orig, attr_shallow)
|
||||
|
||||
model_deep_copy <- xgb.copy.Booster(model)
|
||||
xgb.attr(model_deep_copy, "my_attr") <- 444
|
||||
attr_orig <- xgb.attr(model, "my_attr")
|
||||
attr_deep <- xgb.attr(model_deep_copy, "my_attr")
|
||||
expect_false(attr_orig == attr_deep)
|
||||
})
|
||||
|
||||
test_that("Pointer comparison works as expected", {
|
||||
library(xgboost)
|
||||
y <- mtcars$mpg
|
||||
x <- as.matrix(mtcars[, -1])
|
||||
model <- xgb.train(
|
||||
params = list(nthread = 1),
|
||||
data = xgb.DMatrix(x, label = y, nthread = 1),
|
||||
nrounds = 3
|
||||
)
|
||||
|
||||
model_shallow_copy <- model
|
||||
expect_true(xgb.is.same.Booster(model, model_shallow_copy))
|
||||
|
||||
model_deep_copy <- xgb.copy.Booster(model)
|
||||
expect_false(xgb.is.same.Booster(model, model_deep_copy))
|
||||
|
||||
xgb.attr(model_shallow_copy, "my_attr") <- 111
|
||||
expect_equal(xgb.attr(model, "my_attr"), "111")
|
||||
expect_null(xgb.attr(model_deep_copy, "my_attr"))
|
||||
})
|
||||
|
||||
test_that("DMatrix field are set to booster when training", {
|
||||
set.seed(123)
|
||||
y <- rnorm(100)
|
||||
x <- matrix(rnorm(100 * 3), nrow = 100)
|
||||
x[, 2] <- abs(as.integer(x[, 2]))
|
||||
|
||||
dm_unnamed <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||
dm_feature_names <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
|
||||
dm_feature_types <- xgb.DMatrix(x, label = y)
|
||||
setinfo(dm_feature_types, "feature_type", c("q", "c", "q"))
|
||||
dm_both <- xgb.DMatrix(x, label = y, feature_names = c("a", "b", "c"), nthread = 1)
|
||||
setinfo(dm_both, "feature_type", c("q", "c", "q"))
|
||||
|
||||
params <- list(nthread = 1)
|
||||
model_unnamed <- xgb.train(data = dm_unnamed, params = params, nrounds = 3)
|
||||
model_feature_names <- xgb.train(data = dm_feature_names, params = params, nrounds = 3)
|
||||
model_feature_types <- xgb.train(data = dm_feature_types, params = params, nrounds = 3)
|
||||
model_both <- xgb.train(data = dm_both, params = params, nrounds = 3)
|
||||
|
||||
expect_null(getinfo(model_unnamed, "feature_name"))
|
||||
expect_equal(getinfo(model_feature_names, "feature_name"), c("a", "b", "c"))
|
||||
expect_null(getinfo(model_feature_types, "feature_name"))
|
||||
expect_equal(getinfo(model_both, "feature_name"), c("a", "b", "c"))
|
||||
|
||||
expect_null(variable.names(model_unnamed))
|
||||
expect_equal(variable.names(model_feature_names), c("a", "b", "c"))
|
||||
expect_null(variable.names(model_feature_types))
|
||||
expect_equal(variable.names(model_both), c("a", "b", "c"))
|
||||
|
||||
expect_null(getinfo(model_unnamed, "feature_type"))
|
||||
expect_null(getinfo(model_feature_names, "feature_type"))
|
||||
expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q"))
|
||||
expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q"))
|
||||
})
|
||||
|
||||
@ -111,9 +111,9 @@ test_that("can store evaluation_log without printing", {
|
||||
expect_silent(
|
||||
bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
|
||||
)
|
||||
expect_false(is.null(bst$evaluation_log))
|
||||
expect_false(is.null(bst$evaluation_log$train_error))
|
||||
expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
|
||||
expect_false(is.null(attributes(bst)$evaluation_log))
|
||||
expect_false(is.null(attributes(bst)$evaluation_log$train_error))
|
||||
expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2)
|
||||
})
|
||||
|
||||
test_that("cb.reset.parameters works as expected", {
|
||||
@ -121,34 +121,34 @@ test_that("cb.reset.parameters works as expected", {
|
||||
# fixed eta
|
||||
set.seed(111)
|
||||
bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
|
||||
expect_false(is.null(bst0$evaluation_log))
|
||||
expect_false(is.null(bst0$evaluation_log$train_error))
|
||||
expect_false(is.null(attributes(bst0)$evaluation_log))
|
||||
expect_false(is.null(attributes(bst0)$evaluation_log$train_error))
|
||||
|
||||
# same eta but re-set as a vector parameter in the callback
|
||||
set.seed(111)
|
||||
my_par <- list(eta = c(0.9, 0.9))
|
||||
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
|
||||
callbacks = list(cb.reset.parameters(my_par)))
|
||||
expect_false(is.null(bst1$evaluation_log$train_error))
|
||||
expect_equal(bst0$evaluation_log$train_error,
|
||||
bst1$evaluation_log$train_error)
|
||||
expect_false(is.null(attributes(bst1)$evaluation_log$train_error))
|
||||
expect_equal(attributes(bst0)$evaluation_log$train_error,
|
||||
attributes(bst1)$evaluation_log$train_error)
|
||||
|
||||
# same eta but re-set via a function in the callback
|
||||
set.seed(111)
|
||||
my_par <- list(eta = function(itr, itr_end) 0.9)
|
||||
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
|
||||
callbacks = list(cb.reset.parameters(my_par)))
|
||||
expect_false(is.null(bst2$evaluation_log$train_error))
|
||||
expect_equal(bst0$evaluation_log$train_error,
|
||||
bst2$evaluation_log$train_error)
|
||||
expect_false(is.null(attributes(bst2)$evaluation_log$train_error))
|
||||
expect_equal(attributes(bst0)$evaluation_log$train_error,
|
||||
attributes(bst2)$evaluation_log$train_error)
|
||||
|
||||
# different eta re-set as a vector parameter in the callback
|
||||
set.seed(111)
|
||||
my_par <- list(eta = c(0.6, 0.5))
|
||||
bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
|
||||
callbacks = list(cb.reset.parameters(my_par)))
|
||||
expect_false(is.null(bst3$evaluation_log$train_error))
|
||||
expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
|
||||
expect_false(is.null(attributes(bst3)$evaluation_log$train_error))
|
||||
expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error))
|
||||
|
||||
# resetting multiple parameters at the same time runs with no error
|
||||
my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
|
||||
@ -166,8 +166,8 @@ test_that("cb.reset.parameters works as expected", {
|
||||
my_par <- list(eta = c(0., 0.))
|
||||
bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
|
||||
callbacks = list(cb.reset.parameters(my_par)))
|
||||
expect_false(is.null(bstX$evaluation_log$train_error))
|
||||
er <- unique(bstX$evaluation_log$train_error)
|
||||
expect_false(is.null(attributes(bstX)$evaluation_log$train_error))
|
||||
er <- unique(attributes(bstX)$evaluation_log$train_error)
|
||||
expect_length(er, 1)
|
||||
expect_gt(er, 0.4)
|
||||
})

@ -183,14 +183,14 @@ test_that("cb.save.model works as expected", {
expect_true(file.exists(files[2]))
b1 <- xgb.load(files[1])
xgb.parameters(b1) <- list(nthread = 2)
expect_equal(xgb.ntree(b1), 1)
expect_equal(xgb.get.num.boosted.rounds(b1), 1)
b2 <- xgb.load(files[2])
xgb.parameters(b2) <- list(nthread = 2)
expect_equal(xgb.ntree(b2), 2)
expect_equal(xgb.get.num.boosted.rounds(b2), 2)

xgb.config(b2) <- xgb.config(bst)
expect_equal(xgb.config(bst), xgb.config(b2))
expect_equal(bst$raw, b2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))

# save_period = 0 saves the last iteration's model
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,

@ -198,7 +198,7 @@ test_that("cb.save.model works as expected", {
expect_true(file.exists(files[3]))
b2 <- xgb.load(files[3])
xgb.config(b2) <- xgb.config(bst)
expect_equal(bst$raw, b2$raw)
expect_equal(xgb.save.raw(bst), xgb.save.raw(b2))

for (f in files) if (file.exists(f)) file.remove(f)
})

@ -209,14 +209,14 @@ test_that("early stopping xgb.train works", {
bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
early_stopping_rounds = 3, maximize = FALSE)
, "Stopping. Best iteration")
expect_false(is.null(bst$best_iteration))
expect_lt(bst$best_iteration, 19)
expect_equal(bst$best_iteration, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))

pred <- predict(bst, dtest)
expect_equal(length(pred), 1611)
err_pred <- err(ltest, pred)
err_log <- bst$evaluation_log[bst$best_iteration, test_error]
err_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_error]
expect_equal(err_log, err_pred, tolerance = 5e-6)

set.seed(11)

@ -224,15 +224,15 @@ test_that("early stopping xgb.train works", {
bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
)
expect_equal(bst$evaluation_log, bst0$evaluation_log)
expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log)

fname <- file.path(tempdir(), "model.bin")
xgb.save(bst, fname)
loaded <- xgb.load(fname)

expect_false(is.null(loaded$best_iteration))
expect_equal(loaded$best_iteration, bst$best_ntreelimit)
expect_equal(loaded$best_ntreelimit, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(loaded, "best_iteration")))
expect_equal(xgb.attr(loaded, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))
expect_equal(xgb.attr(loaded, "best_ntreelimit"), xgb.attr(bst, "best_ntreelimit"))
})

test_that("early stopping using a specific metric works", {

@ -243,14 +243,14 @@ test_that("early stopping using a specific metric works", {
callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE,
metric_name = 'test_logloss')))
, "Stopping. Best iteration")
expect_false(is.null(bst$best_iteration))
expect_lt(bst$best_iteration, 19)
expect_equal(bst$best_iteration, bst$best_ntreelimit)
expect_false(is.null(xgb.attr(bst, "best_iteration")))
expect_lt(xgb.attr(bst, "best_iteration"), 19)
expect_equal(xgb.attr(bst, "best_iteration"), xgb.attr(bst, "best_ntreelimit"))

pred <- predict(bst, dtest, ntreelimit = bst$best_ntreelimit)
pred <- predict(bst, dtest, ntreelimit = xgb.attr(bst, "best_ntreelimit"))
expect_equal(length(pred), 1611)
logloss_pred <- sum(-ltest * log(pred) - (1 - ltest) * log(1 - pred)) / length(ltest)
logloss_log <- bst$evaluation_log[bst$best_iteration, test_logloss]
logloss_log <- attributes(bst)$evaluation_log[xgb.attr(bst, "best_iteration"), test_logloss]
expect_equal(logloss_log, logloss_pred, tolerance = 1e-5)
})
@ -35,9 +35,9 @@ num_round <- 2
test_that("custom objective works", {
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_false(is.null(bst$evaluation_log))
expect_false(is.null(bst$evaluation_log$eval_error))
expect_lt(bst$evaluation_log[num_round, eval_error], 0.03)
expect_false(is.null(attributes(bst)$evaluation_log))
expect_false(is.null(attributes(bst)$evaluation_log$eval_error))
expect_lt(attributes(bst)$evaluation_log[num_round, eval_error], 0.03)
})

test_that("custom objective in CV works", {

@ -50,7 +50,7 @@ test_that("custom objective in CV works", {
test_that("custom objective with early stop works", {
bst <- xgb.train(param, dtrain, 10, watchlist)
expect_equal(class(bst), "xgb.Booster")
train_log <- bst$evaluation_log$train_error
train_log <- attributes(bst)$evaluation_log$train_error
expect_true(all(diff(train_log) <= 0))
})

@ -24,28 +24,28 @@ test_that("gblinear works", {
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
ypred <- predict(bst, dtest)
expect_equal(length(getinfo(dtest, 'label')), 1611)
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic',
callbacks = list(cb.gblinear.history()))
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
h <- xgb.gblinear.history(bst)
expect_equal(dim(h), c(n, ncol(dtrain) + 1))
expect_is(h, "matrix")

param$updater <- 'coord_descent'
bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic')
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle')
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)

bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy')
expect_lt(bst$evaluation_log$eval_error[2], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL)

bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty',
top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE)))
expect_lt(bst$evaluation_log$eval_error[n], ERR_UL)
expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL)
h <- xgb.gblinear.history(bst)
expect_equal(dim(h), c(n, ncol(dtrain) + 1))
expect_s4_class(h, "dgCMatrix")

@ -72,10 +72,10 @@ test_that("gblinear early stopping works", {
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)
expect_equal(booster$best_iteration, 5)
expect_equal(xgb.attr(booster, "best_iteration"), 5)
predt_es <- predict(booster, dtrain)

n <- booster$best_iteration + es_round
n <- xgb.attr(booster, "best_iteration") + es_round
booster <- xgb.train(
param, dtrain, n, list(eval = dtest, train = dtrain), early_stopping_rounds = es_round
)

@ -49,6 +49,9 @@ mbst.GLM <- xgb.train(data = xgb.DMatrix(as.matrix(iris[, -5]), label = mlabel),
booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
objective = "multi:softprob", num_class = nclass, base_score = 0)

# without feature names
bst.Tree.unnamed <- xgb.copy.Booster(bst.Tree)
setinfo(bst.Tree.unnamed, "feature_name", NULL)

test_that("xgb.dump works", {
.skip_if_vcd_not_available()
@ -204,7 +207,7 @@ test_that("xgb-attribute functionality", {
list.ch <- list.val[order(names(list.val))]
list.ch <- lapply(list.ch, as.character)
# note: iter is 0-index in xgb attributes
list.default <- list(niter = as.character(nrounds - 1))
list.default <- list()
list.ch <- c(list.ch, list.default)
# proper input:
expect_error(xgb.attr(bst.Tree, NULL))

@ -212,24 +215,25 @@ test_that("xgb-attribute functionality", {
# set & get:
expect_null(xgb.attr(bst.Tree, "asdf"))
expect_equal(xgb.attributes(bst.Tree), list.default)
xgb.attr(bst.Tree, "my_attr") <- val
expect_equal(xgb.attr(bst.Tree, "my_attr"), val)
xgb.attributes(bst.Tree) <- list.val
expect_equal(xgb.attributes(bst.Tree), list.ch)
bst.Tree.copy <- xgb.copy.Booster(bst.Tree)
xgb.attr(bst.Tree.copy, "my_attr") <- val
expect_equal(xgb.attr(bst.Tree.copy, "my_attr"), val)
xgb.attributes(bst.Tree.copy) <- list.val
expect_equal(xgb.attributes(bst.Tree.copy), list.ch)
# serializing:
fname <- file.path(tempdir(), "xgb.model")
xgb.save(bst.Tree, fname)
fname <- file.path(tempdir(), "xgb.ubj")
xgb.save(bst.Tree.copy, fname)
bst <- xgb.load(fname)
expect_equal(xgb.attr(bst, "my_attr"), val)
expect_equal(xgb.attributes(bst), list.ch)
# deletion:
xgb.attr(bst, "my_attr") <- NULL
expect_null(xgb.attr(bst, "my_attr"))
expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")])
expect_equal(xgb.attributes(bst), list.ch[c("a", "b")])
xgb.attributes(bst) <- list(a = NULL, b = NULL)
expect_equal(xgb.attributes(bst), list.default)
xgb.attributes(bst) <- list(niter = NULL)
expect_null(xgb.attributes(bst))
expect_equal(xgb.attributes(bst), list())
})

if (grepl('Windows', Sys.info()[['sysname']], fixed = TRUE) ||

@ -262,21 +266,17 @@ test_that("xgb.Booster serializing as R object works", {
dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))

fname_bin <- file.path(tempdir(), "xgb.model")
xgb.save(bst, fname_bin)
bst <- readRDS(fname_rds)
nil_ptr <- new("externalptr")
class(nil_ptr) <- "xgb.Booster.handle"
expect_true(identical(bst$handle, nil_ptr))
bst <- xgb.Booster.complete(bst)
expect_true(!identical(bst$handle, nil_ptr))
expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
})

test_that("xgb.model.dt.tree works with and without feature names", {
.skip_if_vcd_not_available()
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
dt.tree <- xgb.model.dt.tree(model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
if (!flag_32bit)
expect_equal(dim(dt.tree), c(188, 10))

@ -286,9 +286,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
expect_equal(dt.tree, dt.tree.0)

# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.unnamed)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE])

@ -316,9 +314,7 @@ test_that("xgb.importance works with and without feature names", {
expect_equal(importance.Tree, importance.Tree.0, tolerance = float_tolerance)

# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
importance.Tree.x <- xgb.importance(model = bst.Tree)
importance.Tree.x <- xgb.importance(model = bst.Tree.unnamed)
expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE],
tolerance = float_tolerance)

@ -334,7 +330,7 @@ test_that("xgb.importance works with and without feature names", {
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)

importance_from_dump <- function() {
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
model_text_dump <- xgb.dump(model = bst.Tree.unnamed, with_stats = TRUE, trees = trees)
imp <- xgb.model.dt.tree(
feature_names = feature.names,
text = model_text_dump,

@ -414,13 +410,13 @@ test_that("xgb.plot.importance de-duplicates features", {

test_that("xgb.plot.tree works with and without feature names", {
.skip_if_vcd_not_available()
expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree))
expect_silent(xgb.plot.tree(feature_names = feature.names, model = bst.Tree.unnamed))
expect_silent(xgb.plot.tree(model = bst.Tree))
})

test_that("xgb.plot.multi.trees works with and without feature names", {
.skip_if_vcd_not_available()
xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features_keep = 3)
xgb.plot.multi.trees(model = bst.Tree.unnamed, feature_names = feature.names, features_keep = 3)
xgb.plot.multi.trees(model = bst.Tree, features_keep = 3)
})
@ -17,8 +17,8 @@ test_that("load/save raw works", {
ubj_bytes <- xgb.save.raw(booster, raw_format = "ubj")
old_bytes <- xgb.save.raw(booster, raw_format = "deprecated")

from_json <- xgb.load.raw(json_bytes, as_booster = TRUE)
from_ubj <- xgb.load.raw(ubj_bytes, as_booster = TRUE)
from_json <- xgb.load.raw(json_bytes)
from_ubj <- xgb.load.raw(ubj_bytes)

json2old <- xgb.save.raw(from_json, raw_format = "deprecated")
ubj2old <- xgb.save.raw(from_ubj, raw_format = "deprecated")

@ -26,3 +26,46 @@ test_that("load/save raw works", {
expect_equal(json2old, ubj2old)
expect_equal(json2old, old_bytes)
})

test_that("saveRDS preserves C and R attributes", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(nthread = 1, max_depth = 2),
nrounds = 5
)
attributes(model)$my_attr <- "qwerty"
xgb.attr(model, "c_attr") <- "asdf"

fname <- file.path(tempdir(), "xgb_model.Rds")
saveRDS(model, fname)
model_new <- readRDS(fname)

expect_equal(attributes(model_new)$my_attr, attributes(model)$my_attr)
expect_equal(xgb.attr(model, "c_attr"), xgb.attr(model_new, "c_attr"))
})

test_that("R serializers keep C config", {
data(mtcars)
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1])
dm <- xgb.DMatrix(x, label = y, nthread = 1)
model <- xgb.train(
data = dm,
params = list(
tree_method = "approx",
nthread = 1,
max_depth = 2
),
nrounds = 3
)
model_new <- unserialize(serialize(model, NULL))
expect_equal(
xgb.config(model)$learner$gradient_booster$gbtree_train_param$tree_method,
xgb.config(model_new)$learner$gradient_booster$gbtree_train_param$tree_method
)
expect_equal(variable.names(model), variable.names(model_new))
})
@ -23,11 +23,7 @@ get_num_tree <- function(booster) {
}

run_booster_check <- function(booster, name) {
# If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
booster <- xgb.Booster.complete(booster)
}
config <- jsonlite::fromJSON(xgb.config(booster))
config <- xgb.config(booster)
run_model_param_check(config)
if (name == 'cls') {
testthat::expect_equal(get_num_tree(booster),

@ -76,6 +72,10 @@ test_that("Models from previous versions of XGBoost can be loaded", {
name <- m[3]
is_rds <- endsWith(model_file, '.rds')
is_json <- endsWith(model_file, '.json')
# TODO: update this test for new RDS format
if (is_rds) {
return(NULL)
}
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
booster <- readRDS(model_file)

@ -19,12 +19,12 @@ bst <- xgb.train(data = dtrain,
objective = "binary:logistic")

test_that("call is exposed to R", {
expect_false(is.null(bst$call))
expect_is(bst$call, "call")
expect_false(is.null(attributes(bst)$call))
expect_is(attributes(bst)$call, "call")
})

test_that("params is exposed to R", {
model_params <- bst$params
model_params <- attributes(bst)$params
expect_is(model_params, "list")
expect_equal(model_params$eta, 1)
expect_equal(model_params$max_depth, 2)

@ -17,8 +17,8 @@ test_that('Test ranking with unweighted data', {
eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
})

test_that('Test ranking with weighted data', {

@ -41,8 +41,8 @@ test_that('Test ranking with weighted data', {
)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
# Check if the metric is monotone increasing
expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
expect_true(all(diff(bst$evaluation_log$train_aucpr) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0))
expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0))
for (i in 1:10) {
pred <- predict(bst, newdata = dtrain, ntreelimit = i)
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
@ -40,7 +40,12 @@ test_that("updating the model works", {
bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1r <- xgb.model.dt.tree(model = bst1r)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1r)$evaluation_log)
expect_equal(
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))),
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1r, raw_format = "json"))),
tolerance = 1e-6
)
if (!win32_flag) {
expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
}

@ -51,7 +56,7 @@ test_that("updating the model works", {
bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr2r <- xgb.model.dt.tree(model = bst2r)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
expect_equal(attributes(bst2)$evaluation_log, attributes(bst2r)$evaluation_log)
if (!win32_flag) {
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2r[Feature == 'Leaf']$Gain)
}

@ -59,11 +64,25 @@ test_that("updating the model works", {
expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)

# process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
set.seed(123)
p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1u <- xgb.model.dt.tree(model = bst1u)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
expect_equal(
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1, raw_format = "json"))),
jsonlite::fromJSON(rawToChar(xgb.save.raw(bst1u, raw_format = "json"))),
tolerance = 1e-6
)
expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)

# same thing but with a serialized model
set.seed(123)
bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1))
tr1u <- xgb.model.dt.tree(model = bst1u)
# all should be the same when no subsampling
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log)
expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)

# process type 'update' for model with subsampling, refreshing only the tree stats from training data:

@ -71,12 +90,12 @@ test_that("updating the model works", {
bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
tr2u <- xgb.model.dt.tree(model = bst2u)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
expect_equal(attributes(bst2)$evaluation_log, attributes(bst2u)$evaluation_log)
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Gain - tr2u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
# the results should be the same as for the model with an extra 'refresh' updater
expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
expect_equal(attributes(bst2r)$evaluation_log, attributes(bst2u)$evaluation_log)
if (!win32_flag) {
expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
}

@ -86,7 +105,7 @@ test_that("updating the model works", {
bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1ut <- xgb.model.dt.tree(model = bst1ut)
# should be the same evaluations but different gains and smaller cover (test data is smaller)
expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
expect_equal(attributes(bst1)$evaluation_log, attributes(bst1ut)$evaluation_log)
expect_equal(tr1[Feature == 'Leaf']$Gain, tr1ut[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr1[Feature != 'Leaf']$Gain - tr1ut[Feature != 'Leaf']$Gain)), 100)
expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5)

@ -106,11 +125,12 @@ test_that("updating works for multiclass & multitree", {

# run update process for an original model with subsampling
p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0)
bst0u <- xgb.train(p0u, dtr, nrounds = xgb.get.num.boosted.rounds(bst0),
watchlist, xgb_model = bst0, verbose = 0)
tr0u <- xgb.model.dt.tree(model = bst0u)

# should be the same evaluation but different gains and larger cover
expect_equal(bst0$evaluation_log, bst0u$evaluation_log)
expect_equal(attributes(bst0)$evaluation_log, attributes(bst0u)$evaluation_log)
expect_equal(tr0[Feature == 'Leaf']$Gain, tr0u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr0[Feature != 'Leaf']$Gain - tr0u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr0u$Cover) / sum(tr0$Cover), 1.5)
@ -1,223 +0,0 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}

\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother

\begin{document}
%\SweaveOpts{concordance=TRUE}

<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@

%
<<prelim,echo=FALSE>>=
xgboost.version <- packageDescription("xgboost")$Version

@
%

\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}

\thispagestyle{empty}

\clearpage

\setcounter{page}{1}
\section{Introduction}

This is an introductory document for using the \verb@xgboost@ package in R.

\verb@xgboost@ is short for eXtreme Gradient Boosting. It is an efficient
and scalable implementation of the gradient boosting framework \citep{friedman2001greedy, friedman2000additive}.
The package includes an efficient linear model solver and tree learning algorithms.
It supports various objective functions, including regression, classification
and ranking. The package is designed to be extensible, so that users can easily define their own objectives. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with OpenMP. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
\item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both the tree booster
and the linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective and
evaluation functions.}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}
\section{Example with Mushroom data}

In this section, we will illustrate some common uses of \verb@xgboost@. The
Mushroom data is taken from the UCI Machine Learning Repository \citep{Bache+Lichman:2013}.

<<Training and prediction with mushroom data>>=
library(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
nrounds = 2, objective = "binary:logistic", nthread = 2)
xgb.save(bst, 'model.save')
bst <- xgb.load('model.save')
xgb.parameters(bst) <- list(nthread = 2)
pred <- predict(bst, test$data)
@

\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction with the model.

Here we save the model to a local binary file and load it back when needed.
We cannot inspect the trees inside a binary file; however, there is another
function that saves the model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@

The output looks like:

\begin{verbatim}
booster[0]:
0:[f28<1.00001] yes=1,no=2,missing=2
1:[f108<1.00001] yes=3,no=4,missing=4
3:leaf=1.85965
4:leaf=-1.94071
2:[f55<1.00001] yes=5,no=6,missing=6
5:leaf=-1.70044
6:leaf=1.71218
booster[1]:
0:[f59<1.00001] yes=1,no=2,missing=2
1:leaf=-6.23624
2:[f28<1.00001] yes=3,no=4,missing=4
3:leaf=-0.96853
4:leaf=0.784718
\end{verbatim}

It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from an initial prediction value or weighted training instances.

We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
class(dtrain)
head(getinfo(dtrain, 'label'))
@

We can also save the matrix to a binary file and then load it back simply with
\verb@xgb.DMatrix@:
<<save model>>=
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain <- xgb.DMatrix('xgb.DMatrix')
@

\section{Advanced Examples}

The function \verb@xgboost@ is a simple function with fewer parameters, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the documentation a bit more carefully.

\verb@xgb.train@ only accepts an \verb@xgb.DMatrix@ object as its input, but it supports advanced features such as custom objective and evaluation functions.

<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}

evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- sqrt(mean((preds - labels)^2))
return(list(metric = "RMSE", value = err))
}

dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max_depth = 2, eta = 1, nthread = 2)

bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE)
@

The gradient and the second-order gradient are required in the output of a
customized objective function.

We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
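
For example, a minimal sketch (assuming the \verb@dtrain@ object constructed
above) that extracts the first 100 rows into a new \verb@xgb.DMatrix@:
<<Slice rows>>=
# take a row subset of the DMatrix; useful e.g. for building CV folds
dtrain.sub <- slice(dtrain, 1:100)
dim(dtrain.sub)
@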

For a walkthrough demo, please see \verb@R-package/demo/@ for further
details.

\section{The Higgs Boson competition}

We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.

Here are the instructions for making a submission:
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on an Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}

We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.

\verb@xgboost@ can automatically do parallel computation. On a machine with an Intel
i7-4700MQ and 24GB of memory, we found that \verb@xgboost@ takes about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.

Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This result stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.

\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}

\end{document}

<<Temp file cleaning, include=FALSE>>=
file.remove("xgb.DMatrix")
file.remove("model.dump")
file.remove("model.save")
@
@ -107,7 +107,7 @@ train <- agaricus.train
test <- agaricus.test
```

> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is beyond the scope of this article; however, the `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
> In the real world, it would be up to you to make this division between `train` and `test` data.

Each variable is a `list` containing two things, `label` and `data`:
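
For example, a minimal sketch of inspecting that structure (assuming the `train` object loaded above):

```r
str(train, max.level = 1)
# expected: a list with $data (a sparse dgCMatrix of features)
# and $label (a numeric vector of 0/1 targets)
```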

@ -155,11 +155,13 @@ We will train decision tree model using the following parameters:
bstSparse <- xgboost(
data = train$data
, label = train$label
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
)
```
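
Put together, the updated call reads as follows (a minimal sketch of the `params`-list form introduced above):

```r
bstSparse <- xgboost(
  data = train$data,
  label = train$label,
  params = list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic"),
  nrounds = 2
)
```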

@ -175,11 +177,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R**
bstDense <- xgboost(
data = as.matrix(train$data),
label = train$label,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
params = list(
max_depth = 2,
eta = 1,
nthread = 2,
objective = "binary:logistic"
),
nrounds = 2
)
```

@ -191,11 +195,13 @@ bstDense <- xgboost(
dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2)
bstDMatrix <- xgboost(
data = dtrain,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "binary:logistic"
params = list(
max_depth = 2,
eta = 1,
nthread = 2,
objective = "binary:logistic"
),
nrounds = 2
)
```

@ -209,11 +215,13 @@ One of the simplest way to see the training progress is to set the `verbose` opt
# verbose = 0, no message
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 0
)
```

@ -222,11 +230,13 @@ bst <- xgboost(
# verbose = 1, print evaluation metric
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 1
)
```

@ -235,11 +245,13 @@ bst <- xgboost(
# verbose = 2, also print information about tree
bst <- xgboost(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, objective = "binary:logistic"
, verbose = 2
)
```

@ -336,12 +348,14 @@ watchlist <- list(train = dtrain, test = dtest)

bst <- xgb.train(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, watchlist = watchlist
, objective = "binary:logistic"
)
```

@ -349,7 +363,7 @@ bst <- xgb.train(
Both training and test error metrics are very similar, which in a way makes sense: what we have learned from the training dataset matches the observations from the test dataset.

If you do not get such results with your own dataset, you should think about how you divided it into training and test sets; maybe there is something to fix. Again, the `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
If you do not get such results with your own dataset, you should think about how you divided it into training and test sets; maybe there is something to fix.

For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
@ -357,13 +371,15 @@ For a better understanding of the learning progression, you may want to have som
bst <- xgb.train(
data = dtrain
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
eta = 1
, nthread = 2
, objective = "binary:logistic"
, eval_metric = "error"
, eval_metric = "logloss"
)
, nrounds = 2
, watchlist = watchlist
, eval_metric = "error"
, eval_metric = "logloss"
, objective = "binary:logistic"
)
```

@ -377,14 +393,15 @@ Until now, all the learnings we have performed were based on boosting trees. **X
```{r linearBoosting, message=F, warning=F}
bst <- xgb.train(
data = dtrain
, booster = "gblinear"
, max_depth = 2
, nthread = 2
, params = list(
booster = "gblinear"
, nthread = 2
, objective = "binary:logistic"
, eval_metric = "error"
, eval_metric = "logloss"
)
, nrounds = 2
, watchlist = watchlist
, eval_metric = "error"
, eval_metric = "logloss"
, objective = "binary:logistic"
)
```

@ -406,12 +423,14 @@ xgb.DMatrix.save(dtrain, fname)
dtrain2 <- xgb.DMatrix(fname)
bst <- xgb.train(
data = dtrain2
, max_depth = 2
, eta = 1
, nthread = 2
, params = list(
max_depth = 2
, eta = 1
, nthread = 2
, objective = "binary:logistic"
)
, nrounds = 2
, watchlist = watchlist
, objective = "binary:logistic"
)
```

@ -492,17 +511,17 @@ file.remove(fname)
> result is `0`? We are good!

In some very specific cases, like when you want to pilot **XGBoost** from the `caret` package, you will want to save the model as an *R* binary vector. See below how to do it.
In some very specific cases, you will want to save the model as an *R* binary vector. See below how to do it.

```{r saveLoadRBinVectorModel, message=F, warning=F}
# save model to R's raw vector
rawVec <- xgb.serialize(bst)
rawVec <- xgb.save.raw(bst)

# print class
print(class(rawVec))

# load binary model to R
bst3 <- xgb.load(rawVec)
bst3 <- xgb.load.raw(rawVec)
xgb.parameters(bst3) <- list(nthread = 2)
pred3 <- predict(bst3, test$data)

@ -53,11 +53,10 @@ labels <- c(1, 1, 1,
data <- data.frame(dates = dates, labels = labels)

bst <- xgb.train(
data = xgb.DMatrix(as.matrix(data$dates), label = labels),
data = xgb.DMatrix(as.matrix(data$dates), label = labels, missing = NA),
nthread = 2,
nrounds = 1,
objective = "binary:logistic",
missing = NA,
max_depth = 1
)
```