R-callbacks refactor

Vadim Khotilovich 2016-06-09 02:46:13 -05:00
parent 754f3a6e07
commit 422b0000a8
5 changed files with 1109 additions and 515 deletions

R-package/R/callbacks.R (new file, 500 lines added)

@ -0,0 +1,500 @@
#' Callback closures for booster training.
#'
#' These are used to perform various service tasks either during boosting iterations or at the end.
#' This approach helps to modularize many of these tasks without bloating the main training methods,
#' and it offers a flexible way to extend or customize the training process.
#'
#' @details
#' By default, a callback function is run after each boosting iteration.
#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function.
#'
#' When a callback function has a \code{finalize} parameter, its finalizer part will also be run after
#' the boosting is completed.
#'
#' WARNING: side effects! Be aware that these callback functions access and modify things in
#' the environment from which they are called, which is a fairly uncommon thing to do in R.
#'
#' To write a custom callback closure, make sure you first understand the main concepts about R environments.
#' Check either the R docs on \code{\link[base]{environment}} or the
#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from Hadley Wickham's "Advanced R" book.
#' Then take a look at the code of \code{cb.reset_learning_rate} for a simple example,
#' and see the \code{cb.log_evaluation} code for something more involved.
#' Also, you would need to get familiar with the objects available inside of the \code{xgb.train} internal environment.
#'
#' @seealso
#' \code{\link{cb.print_evaluation}},
#' \code{\link{cb.log_evaluation}},
#' \code{\link{cb.reset_parameters}},
#' \code{\link{cb.early_stop}},
#' \code{\link{cb.save_model}},
#' \code{\link{xgb.train}},
#' \code{\link{xgb.cv}}
#'
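#' @examples
#' ## A minimal sketch of a custom post-iteration callback; the use of
#' ## 'env$iteration' and of the 'name' attribute follows the conventions
#' ## of the callbacks defined in this file:
#' cb.tick <- function() {
#'   callback <- function(env = parent.frame()) {
#'     cat("finished iteration", env$iteration, "\n")
#'   }
#'   attr(callback, 'name') <- 'cb.tick'
#'   callback
#' }
#' ## It could then be passed to xgb.train or xgb.cv through their
#' ## 'callbacks' argument, e.g.:
#' ## xgb.train(param, dtrain, nrounds = 2, callbacks = list(cb.tick()))
#'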
#' @name callbacks
NULL
#
# Callbacks -------------------------------------------------------------------
#
#' Callback closure for printing the result of evaluation
#'
#' @param period results would be printed every \code{period} iterations
#'
#' @details
#' The callback function prints the result of evaluation at every \code{period} iterations.
#' The initial and the last iteration's evaluations are always printed.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available),
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
#'
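#' @examples
#' \dontrun{
#' ## A usage sketch: print the evaluation result only every 5th iteration,
#' ## assuming 'param', 'dtrain' and 'watchlist' are defined as in the
#' ## xgb.train examples:
#' bst <- xgb.train(param, dtrain, nrounds = 50, watchlist,
#'                  callbacks = list(cb.print_evaluation(period = 5)))
#' }
#'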
#' @export
cb.print_evaluation <- function(period=1) {
callback <- function(env = parent.frame()) {
if (length(env$bst_evaluation) == 0 ||
period == 0 ||
NVL(env$rank, 0) != 0 )
return()
i <- env$iteration
if ((i-1) %% period == 0 ||
i == env$begin_iteration ||
i == env$end_iteration) {
msg <- format_eval_string(i, env$bst_evaluation, env$bst_evaluation_err)
cat(msg, '\n')
}
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.print_evaluation'
callback
}
#' Callback closure for logging the evaluation history
#'
#' @details
#' This callback function appends the current iteration evaluation results \code{bst_evaluation}
#' to the \code{evaluation_log} list in the calling frame.
#'
#' The finalizer callback (called with \code{finalize = TRUE} in the end) converts
#' the \code{evaluation_log} list into a final data.table.
#'
#' The iteration evaluation result \code{bst_evaluation} must be a named numeric vector.
#'
#' Note: in the column names of the final data.table, the dash '-' character is replaced with
#' the underscore '_' in order to make the column names more like regular R identifiers.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{evaluation_log},
#' \code{bst_evaluation},
#' \code{iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
#'
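#' @examples
#' \dontrun{
#' ## A usage sketch: train silently but still keep the evaluation history,
#' ## assuming 'param', 'dtrain' and 'watchlist' are defined as in the
#' ## xgb.train examples:
#' bst <- xgb.train(param, dtrain, nrounds = 10, watchlist,
#'                  verbose = 0, callbacks = list(cb.log_evaluation()))
#' print(bst$evaluation_log)
#' }
#'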
#' @export
cb.log_evaluation <- function() {
mnames <- NULL
init <- function(env) {
if (!is.list(env$evaluation_log))
stop("'evaluation_log' has to be a list")
mnames <<- names(env$bst_evaluation)
if (is.null(mnames) || any(mnames == ""))
stop("bst_evaluation must have non-empty names")
mnames <<- gsub('-', '_', names(env$bst_evaluation))
if(!is.null(env$bst_evaluation_err))
mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std'))
}
finalizer <- function(env) {
env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log)))
setnames(env$evaluation_log, c('iter', mnames))
if(!is.null(env$bst_evaluation_err)) {
# rearrange col order from _mean,_mean,...,_std,_std,...
# to be _mean,_std,_mean,_std,...
len <- length(mnames)
means <- mnames[1:(len/2)]
stds <- mnames[(len/2 + 1):len]
cnames <- numeric(len)
cnames[c(TRUE, FALSE)] <- means
cnames[c(FALSE, TRUE)] <- stds
env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with=FALSE]
}
}
callback <- function(env = parent.frame(), finalize = FALSE) {
if (is.null(mnames))
init(env)
if (finalize)
return(finalizer(env))
ev <- env$bst_evaluation
if(!is.null(env$bst_evaluation_err))
ev <- c(ev, env$bst_evaluation_err)
env$evaluation_log <- c(env$evaluation_log,
list(c(iter = env$iteration, ev)))
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.log_evaluation'
callback
}
#' Callback closure for resetting the booster's parameters at each iteration.
#'
#' @param new_params a list where each element corresponds to a parameter that needs to be reset.
#' Each element's value must be either a vector of values of length \code{nrounds}
#' to be set at each iteration,
#' or a function of two parameters \code{learning_rates(iteration, nrounds)}
#' which returns a new parameter value by using the current iteration number
#' and the total number of boosting rounds.
#'
#' @details
#' This is a "pre-iteration" callback function used to reset booster's parameters
#' at the beginning of each iteration.
#'
#' Note that when training is resumed from some previous model, and a function is used to
#' reset a parameter value, the \code{nrounds} argument in this function would be the
#' number of boosting rounds in the current training.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst} or \code{bst_folds},
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
#'
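#' @examples
#' \dontrun{
#' ## A usage sketch, assuming 'param', 'dtrain' and 'watchlist' are defined
#' ## as in the xgb.train examples: decay eta over 5 rounds, either by giving
#' ## one value per round or by giving a function of (iteration, nrounds):
#' bst <- xgb.train(param, dtrain, nrounds = 5, watchlist,
#'                  callbacks = list(cb.reset_parameters(
#'                    list(eta = seq(0.5, 0.1, length.out = 5)))))
#' bst <- xgb.train(param, dtrain, nrounds = 5, watchlist,
#'                  callbacks = list(cb.reset_parameters(
#'                    list(eta = function(i, n) 0.5 * 0.8^i))))
#' }
#'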
#' @export
cb.reset_parameters <- function(new_params) {
if (typeof(new_params) != "list")
stop("'new_params' must be a list")
pnames <- gsub("\\.", "_", names(new_params))
# TODO: restrict the set of parameters that could be reset?
nrounds <- NULL
# run some checks in the beginning
init <- function(env) {
nrounds <<- env$end_iteration - env$begin_iteration + 1
for (n in pnames) {
p <- new_params[[n]]
if (is.function(p)) {
if (length(formals(p)) != 2)
stop("Parameter '", n, "' is a function but not of two arguments")
} else if (is.numeric(p) || is.character(p)) {
if (length(p) != nrounds)
stop("Length of '", n, "' has to be equal to 'nrounds'")
} else {
stop("Parameter '", n, "' is not a function or a vector")
}
}
}
callback <- function(env = parent.frame()) {
if (is.null(nrounds))
init(env)
i <- env$iteration
pars <- lapply(new_params, function(p) {
if (is.function(p))
return(p(i, nrounds))
p[i]
})
if (!is.null(env$bst)) {
xgb.parameters(env$bst$handle) <- pars
} else {
for (fd in env$bst_folds)
xgb.parameters(fd$bst$handle) <- pars
}
}
attr(callback, 'is_pre_iteration') <- TRUE
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.reset_parameters'
callback
}
#' Callback closure to activate early stopping.
#'
#' @param stopping_rounds The number of rounds with no improvement in
#' the evaluation metric after which training will be stopped.
#' @param maximize whether to maximize the evaluation metric
#' @param metric_name the name of an evaluation column to use as a criterion for early
#' stopping. If not set, the last column would be used.
#' Let's say the test data in \code{watchlist} was labelled as \code{dtest},
#' and one wants to use the AUC in test data for early stopping regardless of where
#' it is in the \code{watchlist}, then one of the following would need to be set:
#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}.
#' All dash '-' characters in metric names are considered equivalent to '_'.
#' @param verbose whether to print the early stopping information.
#'
#' @details
#' This callback function determines the condition for early stopping
#' by setting the \code{stop_condition = TRUE} flag in its calling frame.
#'
#' The following additional fields are assigned to the model R object:
#' \itemize{
#' \item \code{best_score} the evaluation score at the best iteration
#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
#' It differs from \code{best_iteration} in multiclass or random forest settings.
#' }
#'
#' The same values are also stored as xgb-attributes; however:
#' \itemize{
#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
#' \item \code{best_msg} message string is also stored.
#' }
#'
#' At least one data element is required in the evaluation watchlist for early stopping to work.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{stop_condition},
#' \code{bst_evaluation},
#' \code{rank},
#' \code{bst} or \code{bst_folds},
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration},
#' \code{num_parallel_tree},
#' \code{num_class}.
#'
#' @seealso
#' \code{\link{callbacks}},
#' \code{\link{xgb.attr}}
#'
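#' @examples
#' \dontrun{
#' ## A usage sketch, assuming 'param', 'dtrain' and 'dtest' are defined as in
#' ## the xgb.train examples: stop when 'test-auc' has not improved for
#' ## 3 rounds, regardless of its position in the watchlist:
#' bst <- xgb.train(param, dtrain, nrounds = 50,
#'                  watchlist = list(train = dtrain, test = dtest),
#'                  callbacks = list(cb.early_stop(stopping_rounds = 3,
#'                                                 maximize = TRUE,
#'                                                 metric_name = 'test-auc')))
#' }
#'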
#' @export
cb.early_stop <- function(stopping_rounds, maximize=FALSE,
metric_name=NULL, verbose=TRUE) {
# state variables
best_iteration <- -1
best_score <- Inf
best_msg <- NULL
metric_idx <- 1
init <- function(env) {
if (length(env$bst_evaluation) == 0)
stop("For early stopping, watchlist must have at least one element")
eval_names <- gsub('-', '_', names(env$bst_evaluation))
if (!is.null(metric_name)) {
metric_idx <<- which(gsub('-', '_', metric_name) == eval_names)
if (length(metric_idx) == 0)
stop("'metric_name' for early stopping is not one of the following:\n",
paste(eval_names, collapse=' '), '\n')
}
if (is.null(metric_name) &&
length(env$bst_evaluation) > 1) {
metric_idx <<- length(eval_names)
if (verbose)
cat('Multiple eval metrics are present. Will use ',
eval_names[metric_idx], ' for early stopping.\n', sep = '')
}
metric_name <<- eval_names[metric_idx]
# maximize is usually NULL when it is not set in xgb.train and a built-in metric is used
if (is.null(maximize))
maximize <<- ifelse(grepl('(_auc|_map|_ndcg)', metric_name), TRUE, FALSE)
if (verbose && NVL(env$rank, 0) == 0)
cat("Will train until ", metric_name, " hasn't improved in ",
stopping_rounds, " rounds.\n\n", sep = '')
best_iteration <<- 1
if (maximize) best_score <<- -Inf
env$stop_condition <- FALSE
if (!is.null(env$bst)) {
if (class(env$bst) != 'xgb.Booster')
stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
best_score <<- as.numeric(best_score)
best_iteration <<- as.numeric(xgb.attr(env$bst$handle, 'best_iteration')) + 1
best_msg <<- as.character(xgb.attr(env$bst$handle, 'best_msg'))
} else {
xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
best_score = best_score)
}
} else if (is.null(env$bst_folds)) {
stop("Parent frame has neither 'bst' nor 'bst_folds'")
}
}
finalizer <- function(env) {
best_ntreelimit = best_iteration * env$num_parallel_tree * env$num_class
if (!is.null(env$bst)) {
attr_best_score = as.numeric(xgb.attr(env$bst$handle, 'best_score'))
if (best_score != attr_best_score)
stop("Inconsistent 'best_score' between the state: ", best_score,
" and the xgb.attr: ", attr_best_score)
env$bst$best_score = best_score
env$bst$best_iteration = best_iteration
env$bst$best_ntreelimit = best_ntreelimit
} else {
attr(env$bst_folds, 'best_iteration') <- best_iteration
attr(env$bst_folds, 'best_ntreelimit') <- best_ntreelimit
}
}
callback <- function(env = parent.frame(), finalize = FALSE) {
if (best_iteration < 0)
init(env)
if (finalize)
return(finalizer(env))
i <- env$iteration
score = env$bst_evaluation[metric_idx]
if (( maximize && score > best_score) ||
(!maximize && score < best_score)) {
best_msg <<- format_eval_string(i, env$bst_evaluation, env$bst_evaluation_err)
best_score <<- score
best_iteration <<- i
# save the property to attributes, so they will occur in checkpoint
if (!is.null(env$bst)) {
xgb.attributes(env$bst) <- list(
best_iteration = best_iteration - 1, # convert to 0-based index
best_score = best_score,
best_msg = best_msg,
best_ntreelimit = best_iteration * env$num_parallel_tree * env$num_class)
}
} else if (i - best_iteration >= stopping_rounds) {
env$stop_condition <- TRUE
env$end_iteration <- i
if (verbose && NVL(env$rank, 0) == 0)
cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '')
}
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.early_stop'
callback
}
#' Callback closure for saving a model file.
#'
#' @param save_period save the model to disk after every
#' \code{save_period} iterations; 0 means save the model at the end.
#' @param save_name the name or path for the saved model file.
#' It can contain a \code{\link[base]{sprintf}} formatting specifier
#' to include the integer iteration number in the file name.
#' E.g., with \code{save_name} = 'xgboost_%04d.model',
#' the file saved at iteration 50 would be named "xgboost_0050.model".
#'
#' @details
#' This callback function allows saving an xgb-model file, either periodically after every \code{save_period} iterations or at the end of training.
#'
#' Callback function expects the following values to be set in its calling frame:
#' \code{bst},
#' \code{iteration},
#' \code{begin_iteration},
#' \code{end_iteration}.
#'
#' @seealso
#' \code{\link{callbacks}}
#'
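#' @examples
#' \dontrun{
#' ## A usage sketch, assuming 'param' and 'dtrain' are defined as in the
#' ## xgb.train examples: save the model every 10 iterations, with the
#' ## iteration number embedded in the file name:
#' bst <- xgb.train(param, dtrain, nrounds = 100,
#'                  callbacks = list(cb.save_model(save_period = 10,
#'                                                 save_name = "xgboost_%04d.model")))
#' }
#'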
#' @export
cb.save_model <- function(save_period = 0, save_name = "xgboost.model") {
if (save_period < 0)
stop("'save_period' cannot be negative")
callback <- function(env = parent.frame()) {
if (is.null(env$bst))
stop("'save_model' callback requires the 'bst' booster object in its calling frame")
if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
(save_period == 0 && env$iteration == env$end_iteration))
xgb.save(env$bst, sprintf(save_name, env$iteration))
}
attr(callback, 'call') <- match.call()
attr(callback, 'name') <- 'cb.save_model'
callback
}
#
# Internal utility functions for callbacks ------------------------------------
#
# Format the evaluation metric string
format_eval_string <- function(iter, eval_res, eval_err=NULL) {
if (length(eval_res) == 0)
stop('no evaluation results')
enames <- names(eval_res)
if (is.null(enames))
stop('evaluation results must have names')
iter <- sprintf('[%d]\t', iter)
if (!is.null(eval_err)) {
if (length(eval_res) != length(eval_err))
stop('eval_res & eval_err lengths mismatch')
res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse='\t')
} else {
res <- paste0(sprintf("%s:%f", enames, eval_res), collapse='\t')
}
return(paste0(iter, res))
}
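# As a quick illustration of the output format:
#   format_eval_string(3, c('train-auc' = 0.9, 'test-auc' = 0.8))
# produces "[3]\ttrain-auc:0.900000\ttest-auc:0.800000"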
# Extract callback names from the list of callbacks
callback.names <- function(cb.list) {
unlist(lapply(cb.list, function(x) attr(x, 'name')))
}
# Extract callback calls from the list of callbacks
callback.calls <- function(cb.list) {
unlist(lapply(cb.list, function(x) attr(x, 'call')))
}
# Sort callbacks list into categories
categorize.callbacks <- function(cb.list) {
list(
pre_iter = Filter(function(x) {
pre <- attr(x, 'is_pre_iteration')
!is.null(pre) && pre
}, cb.list),
post_iter = Filter(function(x) {
pre <- attr(x, 'is_pre_iteration')
is.null(pre) || !pre
}, cb.list),
finalize = Filter(function(x) {
'finalize' %in% names(formals(x))
}, cb.list)
)
}
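# As a quick illustration: given
#   cbs <- list(cb.reset_parameters(list(eta = c(0.5, 0.1))), cb.print_evaluation())
# categorize.callbacks(cbs) puts the first callback into 'pre_iter' (it carries the
# 'is_pre_iteration' attribute) and the second into 'post_iter'; neither of them
# has a 'finalize' argument, so 'finalize' stays empty.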
# Check whether all callback functions with names given by 'query.names' are present in the 'cb.list'.
has.callbacks <- function(cb.list, query.names) {
if (length(cb.list) < length(query.names))
return(FALSE)
if (!is.list(cb.list) ||
!all(sapply(cb.list, class) == 'function'))
stop('`cb.list` must be a list of callback functions')
cb.names <- callback.names(cb.list)
if (!is.character(cb.names) ||
length(cb.names) != length(cb.list) ||
any(cb.names == ""))
stop('All callbacks in the `cb.list` must have a non-empty `name` attribute')
if (!is.character(query.names) ||
length(query.names) == 0 ||
any(query.names == ""))
stop('query.names must be a non-empty vector of non-empty character names')
return(all(query.names %in% cb.names))
}
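# As a quick illustration:
#   has.callbacks(list(cb.print_evaluation(), cb.early_stop(3)), 'cb.early_stop')  # TRUE
#   has.callbacks(list(cb.print_evaluation()), 'cb.early_stop')                    # FALSE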

R-package/R/utils.R

@ -1,181 +1,219 @@
#
# This file is for the low-level reusable utility functions
# that are not supposed to be visible to a user.
#

#
# General helper utilities ----------------------------------------------------
#

# SQL-style NVL shortcut.
NVL <- function(x, val) {
  if (is.null(x))
    return(val)
  if (is.vector(x)) {
    x[is.na(x)] <- val
    return(x)
  }
  if (typeof(x) == 'closure')
    return(x)
  stop("'x' is of a type that is not supported by NVL")
}
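# As a quick illustration of the semantics:
#   NVL(NULL, 0)          # -> 0
#   NVL(c(1, NA, 3), 0)   # -> c(1, 0, 3)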
#
# Low-level functions for boosting --------------------------------------------
#
# Merges booster params with whatever is provided in ...
# plus runs some checks
check.params <- function(params, ...) {
if (typeof(params) != "list")
stop("params must be a list")
# in R interface, allow for '.' instead of '_' in parameter names
names(params) <- gsub("\\.", "_", names(params))
# merge parameters from the params and the dots-expansion
dot.params <- list(...)
names(dot.params) <- gsub("\\.", "_", names(dot.params))
if (length(intersect(names(params), names(dot.params))) > 0)
stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.")
params <- c(params, dot.params)
# only multiple eval_metric's make sense
name.freqs <- table(names(params))
multi.names <- setdiff( names(name.freqs[name.freqs>1]), 'eval_metric')
if (length(multi.names) > 0) {
warning("The following parameters (other than 'eval_metric') were provided multiple times:\n\t",
paste(multi.names, collapse=', '), "\n Only the last value for each of them will be used.\n")
# While xgboost itself would choose the last value for a multi-parameter,
# will do some clean-up here b/c multi-parameters could be used further in R code, and R would
# pick the 1st (not the last) value when multiple elements with the same name are present in a list.
for (n in multi.names) {
del.idx <- which(n == names(params))
del.idx <- del.idx[-length(del.idx)]
params[[del.idx]] <- NULL
}
}
# for multiclass, expect num_class to be set
if (typeof(params[['objective']]) == "character" &&
substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:') {
if (as.numeric(NVL(params[['num_class']], 0)) < 2)
stop("'num_class' > 1 parameter must be set for multiclass classification")
}
return(params)
}
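# As a quick illustration (a hypothetical call, not taken from the package tests):
#   check.params(list(max.depth = 2), eval_metric = 'auc', eval_metric = 'error')
# returns list(max_depth = 2, eval_metric = 'auc', eval_metric = 'error'):
# dots in parameter names are converted to underscores, multiple 'eval_metric'
# entries are kept, and any other duplicated parameter triggers a warning.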
# Performs some checks related to custom objective function.
# WARNING: has side-effects and can modify 'params' and 'obj' in its calling frame
check.custom.obj <- function(env = parent.frame()) {
if (!is.null(env$params[['objective']]) && !is.null(env$obj))
stop("Setting objectives in 'params' and 'obj' at the same time is not allowed")
if (!is.null(env$obj) && typeof(env$obj) != 'closure')
stop("'obj' must be a function")
# handle the case when custom objective function was provided through params
if (!is.null(env$params[['objective']]) &&
typeof(env$params$objective) == 'closure') {
env$obj <- env$params$objective
p <- env$params
p$objective <- NULL
env$params <- p
}
}
# Performs some checks related to custom evaluation function.
# WARNING: has side-effects and can modify 'params' and 'feval' in its calling frame
check.custom.eval <- function(env = parent.frame()) {
if (!is.null(env$params[['eval_metric']]) && !is.null(env$feval))
stop("Setting evaluation metrics in 'params' and 'feval' at the same time is not allowed")
if (!is.null(env$feval) && typeof(env$feval) != 'closure')
stop("'feval' must be a function")
if (!is.null(env$feval) && is.null(env$maximize))
stop("Please set 'maximize' to indicate whether the metric needs to be maximized or not")
# handle the situation when custom eval function was provided through params
if (!is.null(env$params[['eval_metric']]) &&
typeof(env$params$eval_metric) == 'closure') {
env$feval <- env$params$eval_metric
p <- env$params
p[ which(names(p) == 'eval_metric') ] <- NULL
env$params <- p
}
}
# Update booster with dtrain for an iteration
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
  if (class(booster) != "xgb.Booster.handle") {
    stop("first argument type must be xgb.Booster.handle")
  }
  if (class(dtrain) != "xgb.DMatrix") {
    stop("second argument type must be xgb.DMatrix")
  }
  if (is.null(obj)) {
    .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
          PACKAGE = "xgboost")
  } else {
    pred <- predict(booster, dtrain)
    gpair <- obj(pred, dtrain)
    .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
  }
  return(TRUE)
}

# Evaluate one iteration.
# Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
  if (class(booster) != "xgb.Booster.handle")
    stop("first argument must be type xgb.Booster.handle")
  if (length(watchlist) == 0)
    return(NULL)

  evnames <- names(watchlist)
  if (is.null(feval)) {
    msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
                 as.list(evnames), PACKAGE = "xgboost")
    msg <- str_split(msg, '(\\s+|:|\\s+)')[[1]][-1]
    res <- as.numeric(msg[c(FALSE, TRUE)]) # even indices are the values
    names(res) <- msg[c(TRUE, FALSE)]      # odds are the names
  } else {
    res <- sapply(seq_along(watchlist), function(j) {
      w <- watchlist[[j]]
      preds <- predict(booster, w) # predict using all trees
      eval_res <- feval(preds, w)
      out <- eval_res$value
      names(out) <- paste0(evnames[j], "-", eval_res$metric)
      out
    })
  }
  return(res)
}

#
# Helper functions for cross validation ---------------------------------------
#

# Generates random (stratified if needed) CV folds
generate.cv.folds <- function(nfold, nrows, stratified, label, params) {

  # cannot do it for rank
  if (exists('objective', where=params) &&
      is.character(params$objective) &&
      strtrim(params$objective, 5) == 'rank:')
    stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
         "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")

  # shuffle
  rnd.idx <- sample(1:nrows)

  if (stratified &&
      length(label) == length(rnd.idx)) {
    y <- label[rnd.idx]
    # WARNING: some heuristic logic is employed to identify classification setting!
    # - For classification, need to convert y labels to factor before making the folds,
    #   and then do stratification by factor levels.
    # - For regression, leave y numeric and do stratification by quantiles.
    if (exists('objective', where=params) &&
        is.character(params$objective)) {
      # If 'objective' provided in params, assume that y is a classification label
      # unless objective is reg:linear
      if (params$objective != 'reg:linear')
        y <- factor(y)
    } else {
      # If no 'objective' given in params, it means that user either wants to use
      # the default 'reg:linear' objective or has provided a custom obj function.
      # Here, assume classification setting when y has 5 or less unique values:
      if (length(unique(y)) <= 5)
        y <- factor(y)
    }
    folds <- xgb.createFolds(y, nfold)
  } else {
    # make simple non-stratified folds
    kstep <- length(rnd.idx) %/% nfold
    folds <- list()
    for (i in 1:(nfold - 1)) {
      folds[[i]] <- rnd.idx[1:kstep]
      rnd.idx <- rnd.idx[-(1:kstep)]
    }
    folds[[nfold]] <- rnd.idx
  }
  return(folds)
}

# Creates CV folds stratified by the values of y.
# It was borrowed from caret::createFolds and simplified
# by always returning an unnamed list of fold indices.
xgb.createFolds <- function(y, k = 10)
{
  if(is.numeric(y)) {

R-package/R/xgb.cv.R

@ -2,17 +2,6 @@
#'
#' The cross validation function of xgboost
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
@ -35,7 +24,7 @@
#' value that represents missing values. Sometimes, 0 or another extreme value is used to represent missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether to show the standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
#' Possible options are:
#' \itemize{
@ -56,14 +45,16 @@
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' doesn't improve for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @return
#' TODO: update this...
#'
#' If \code{prediction = TRUE}, a list with the following elements is returned:
#' \itemize{
#' \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
@ -89,162 +80,209 @@
#' history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
#'                   max.depth =3, eta = 1, objective = "binary:logistic")
#' print(history)
#'
#' @export
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
                   prediction = FALSE, showsd = TRUE, metrics=list(),
                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
                   verbose = TRUE, print.every.n=1L,
                   early.stop.round = NULL, maximize = NULL, callbacks = list(), ...) {

#strategy <- match.arg(strategy)
params <- check.params(params, ...)
# TODO: should we deprecate the redundant 'metrics' parameter?
for (m in metrics)
params <- c(params, list("eval_metric" = m))
check.custom.obj()
check.custom.eval()
#if (is.null(params[['eval_metric']]) && is.null(feval))
# stop("Either 'eval_metric' or 'feval' must be provided for CV")
# Labels
labels <- label
if (class(data) == 'xgb.DMatrix')
  labels <- getinfo(data, 'label')
if (is.null(labels))
  stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
# CV folds
if(!is.null(folds)) {
if(class(folds) != "list" || length(folds) < 2)
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds)
} else {
if (nfold <= 1)
stop("'nfold' must be > 1")
folds <- generate.cv.folds(nfold, nrow(data), stratified, label, params)
}
# Potential TODO: sequential CV
#if (strategy == 'sequential')
# stop('Sequential CV strategy is not yet implemented')
# verbosity & evaluation printing callback:
params <- c(params, list(silent = 1))
print.every.n <- max( as.integer(print.every.n), 1L)
if (!has.callbacks(callbacks, 'cb.print_evaluation') && verbose)
callbacks <- c(callbacks, cb.print_evaluation(print.every.n))
# evaluation log callback: always on in CV
evaluation_log <- list()
if (!has.callbacks(callbacks, 'cb.log_evaluation'))
callbacks <- c(callbacks, cb.log_evaluation())
# Early stopping callback
stop_condition <- FALSE
if (!is.null(early.stop.round) &&
!has.callbacks(callbacks, 'cb.early_stop'))
callbacks <- c(callbacks, cb.early_stop(early.stop.round, maximize=maximize, verbose=verbose))
# Sort the callbacks into categories
names(callbacks) <- callback.names(callbacks)
cb <- categorize.callbacks(callbacks)
# create the booster-folds
dall <- xgb.get.DMatrix(data, label, missing)
bst_folds <- lapply(1:length(folds), function(k) {
dtest <- slice(dall, folds[[k]])
dtrain <- slice(dall, unlist(folds[-k]))
bst <- xgb.Booster(params, list(dtrain, dtest))
list(dtrain=dtrain, bst=bst, watchlist=list(train=dtrain, test=dtest), index=folds[[k]])
})
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
begin_iteration <- 1
end_iteration <- nrounds
# synchronous CV boosting: run CV folds' models within each iteration
for (iteration in begin_iteration:end_iteration) {
for (f in cb$pre_iter) f()
msg <- lapply(bst_folds, function(fd) {
xgb.iter.update(fd$bst, fd$dtrain, iteration - 1, obj)
xgb.iter.eval(fd$bst, fd$watchlist, iteration - 1, feval)
})
msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg)
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2)
for (f in cb$post_iter) f()
if (stop_condition) break
}
for (f in cb$finalize) f(finalize=TRUE)
# the CV result
ret <- list(
call = match.call(),
params = params,
callbacks = callbacks,
evaluation_log = evaluation_log,
nboost = end_iteration,
ntree = end_iteration * num_parallel_tree * num_class
)
if (!is.null(attr(bst_folds, 'best_iteration'))) {
ret$best_iteration <- attr(bst_folds, 'best_iteration')
ret$best_ntreelimit <- attr(bst_folds, 'best_ntreelimit')
}
ret$folds <- folds
# TODO: should making prediction go
# a. into a callback?
# b. return folds' models, and have a separate method for predictions?
if (prediction) {
ret$pred <- ifelse(num_class > 1,
matrix(0, nrow(data), num_class),
rep(0, nrow(data)))
ntreelimit <- NVL(ret$best_ntreelimit, ret$ntree)
for (fd in bst_folds) {
pred <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit)
if (is.matrix(ret$pred))
ret$pred[fd$index,] <- t(matrix(pred, num_class, length(fd$index)))
else
ret$pred[fd$index] <- pred
}
ret$bst <- lapply(bst_folds, function(x) {
  xgb.Booster.check(xgb.handleToBooster(x$bst), saveraw = TRUE)
})
}

class(ret) <- 'xgb.cv.synchronous'
invisible(ret)
}
dtrain <- xgb.get.DMatrix(data, label, missing)
dot.params <- list(...)
nms.params <- names(params)
nms.dot.params <- names(dot.params)
if (length(intersect(nms.params,nms.dot.params)) > 0)
stop("Duplicated defined term in parameters. Please check your list of params.")
params <- append(params, dot.params)
params <- append(params, list(silent=1))
for (mc in metrics) {
params <- append(params, list("eval_metric"=mc))
}
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.cv: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective) == 'function') {
obj <- params$objective
params[['objective']] <- NULL
}
# if (!is.null(params$eval_metric) && !is.null(feval))
# stop("xgb.cv: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric) == 'function') {
feval <- params$eval_metric
params[['eval_metric']] <- NULL
}
# Early Stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize <- FALSE
} else {
maximize <- TRUE
}
}
if (maximize) {
bestScore <- 0
} else {
bestScore <- Inf
}
bestInd <- 0
earlyStopflag <- FALSE
if (length(metrics) > 1)
warning('Only the first metric is used for early stopping process.')
}
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type <- params[['objective']]
mat_pred <- FALSE
if (!is.null(obj_type) && obj_type == 'multi:softprob')
{
num_class <- params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0, nrow(dtrain), num_class)
mat_pred <- TRUE
}
else
predictValues <- rep(0, nrow(dtrain))
history <- c()
print.every.n <- max(as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- xgb_folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
if(verbose)
if (0 == (i - 1L) %% print.every.n)
cat(ret, "\n", sep="")
# early_Stopping
if (!is.null(early.stop.round)){
score <- strsplit(ret,'\\s+')[[1]][1 + length(metrics) + 2]
score <- strsplit(score,'\\+|:')[[1]][[2]]
score <- as.numeric(score)
if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
bestScore <- score
bestInd <- i
} else {
if (i - bestInd >= early.stop.round) {
earlyStopflag <- TRUE
cat('Stopping. Best iteration:', bestInd, '\n')
break
}
}
}
}
if (prediction) {
for (k in 1:nfold) {
fd <- xgb_folds[[k]]
if (!is.null(early.stop.round) && earlyStopflag) {
res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction)
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction)
}
if (mat_pred) {
pred_mat <- matrix(res[[2]],num_class,length(fd$index))
predictValues[fd$index,] <- t(pred_mat)
} else {
predictValues[fd$index] <- res[[2]]
}
}
}
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
colnamesMean <- paste(colnames, "mean")
if(showsd) colnamesStd <- paste(colnames, "std")
colnames <- c()
if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
else colnames <- colnamesMean
type <- rep(x = "numeric", times = length(colnames))
dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist( list( dt, .), use.names = F, fill = F)}
if (prediction) {
return( list( dt = dt,pred = predictValues))
}
return(dt)
} }
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

#' Print xgb.cv result
#'
#' Prints formatted results of \code{xgb.cv}.
#'
#' @param x an \code{xgb.cv.synchronous} object
#' @param verbose whether to print detailed data
#' @param ... passed to \code{print.data.table}
#'
#' @details
#' When not verbose, it would only print the evaluation results,
#' including the best iteration (when available).
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max.depth = 2,
#'              eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#' print(cv)
#' print(cv, verbose=TRUE)
#'
#' @rdname print.xgb.cv
#' @export
print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
cat('##### xgb.cv ', length(x$folds), '-folds\n', sep='')
if (verbose) {
if (!is.null(x$call)) {
cat('call:\n ')
print(x$call)
}
if (!is.null(x$params)) {
cat('params (as set within xgb.cv):\n')
cat( ' ',
paste(names(x$params),
paste0('"', unlist(x$params), '"'),
sep=' = ', collapse=', '), '\n', sep='')
}
if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
cat('callbacks:\n')
lapply(callback.calls(x$callbacks), function(x) {
cat(' ')
print(x)
})
}
for (n in c('nboost', 'ntree', 'best_iteration', 'best_ntreelimit')) {
if (is.null(x[[n]]))
next
cat(n, ': ', x[[n]], '\n', sep='')
}
cat('nfolds: ', length(x$folds), '\n', sep='')
if (!is.null(x$pred)) {
cat('pred:\n')
str(x$pred)
}
}
if (verbose)
cat('evaluation_log:\n')
print(x$evaluation_log, row.names = FALSE, ...)
if (!is.null(x$best_iteration)) {
cat('Best iteration:\n')
print(x$evaluation_log[x$best_iteration], row.names = FALSE, ...)
}
invisible(x)
}

R-package/R/xgb.train.R

@ -1,8 +1,10 @@
#' eXtreme Gradient Boosting Training
#'
#' \code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
#'
#' @param params the list of parameters.
#' The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
#' Below is a shorter summary:
#'
#' 1. General Parameters
#'
@ -51,60 +53,98 @@
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param data input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.
#' @param nrounds the max number of iterations
#' @param watchlist what information should be printed when \code{verbose=1} or
#' \code{verbose=2}. Watchlist is used to specify validation set monitoring
#' during training. For example user can specify
#' watchlist=list(validation1=mat1, validation2=mat2) to watch
#' the performance of each round's model on mat1 and mat2
#'
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain,
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress.
#' @param print.every.n Print every N progress messages when \code{verbose>0}.
#' Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set,
#' then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param save_period save the model to the disk after every \code{save_period} rounds, 0 means save at the end.
#' @param save_name the name or path for periodically saved model file.
#' @param xgb_model the previously built model to continue the training from.
#' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
#' file with a previously saved model.
#' @param callbacks a list of callback functions to perform various tasks during boosting.
#' See \code{\link{callbacks}}. Some of the callbacks are currently automatically
#' created when specific parameters are set.
#' @param ... other parameters to pass to \code{params}.
#' @param label the response variable. User should not set this field,
#' if data is local data file or \code{xgb.DMatrix}.
#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
#' by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
#' This parameter is only used when input is dense matrix.
#' @param weight a vector indicating the weight for each row of the input.
#'
#' @details
#' These are the training functions for \code{xgboost}.
#'
#' The \code{xgb.train} interface supports advanced features such as \code{watchlist},
#' customized objective and evaluation metric functions, therefore it is more flexible
#' than the \code{\link{xgboost}} interface.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#' Number of threads can also be manually specified via \code{nthread} parameter.
#'
#' The evaluation metric is chosen automatically by Xgboost (according to the objective)
#' when the \code{eval_metric} parameter is not provided.
#' User may set one or several \code{eval_metric} parameters.
#' Note that when using a customized metric, only this single metric can be used.
#' The following is the list of built-in metrics for which Xgboost provides optimized implementation:
#' \itemize{
#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
#' Different threshold (e.g., 0.) could be specified as "error@0."
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
#' }
#'
#' The following callbacks are automatically created when certain parameters are set:
#' \itemize{
#' \item \code{cb.print_evaluation} is turned on when \code{verbose > 0};
#' and the \code{print.every.n} parameter is passed to it.
#' \item \code{cb.log_evaluation} is on when \code{verbose > 0} and \code{watchlist} is present.
#' \item \code{cb.early_stop}: when \code{early.stop.round} is set.
#' \item \code{cb.save_model}: when \code{save_period > 0} is set.
#' }
#'
#' @return
#' TODO
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
#' watchlist <- list(eval = dtest, train = dtrain)
#'
#' ## A simple xgb.train example:
#' param <- list(max.depth = 2, eta = 1, silent = 1, objective="binary:logistic", eval_metric="auc")
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
#'
#' ## An xgb.train example where custom objective and evaluation metric are used:
#' logregobj <- function(preds, dtrain) {
#'    labels <- getinfo(dtrain, "label")
#'    preds <- 1/(1 + exp(-preds))
@ -117,121 +157,139 @@
#'    err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#'    return(list(metric = "error", value = err))
#' }
#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj, eval_metric=evalerror)
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
#'
#' ## An xgb.train example of using variable learning rates at each iteration:
#' my_etas <- list(eta = c(0.5, 0.1))
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
#' callbacks = list(cb.reset_parameters(my_etas)))
#'
#' ## Explicit use of the cb.log_evaluation callback allows to run
#' ## xgb.train silently but still store the evaluation results:
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
#' verbose = 0, callbacks = list(cb.log_evaluation()))
#' print(bst$evaluation_log)
#'
#' ## An 'xgboost' interface example:
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#' pred <- predict(bst, agaricus.test$data)
#'
#' @rdname xgb.train
#' @export
xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
                      obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
                      early.stop.round = NULL, maximize = NULL,
                      save_period = NULL, save_name = "xgboost.model",
                      xgb_model = NULL, callbacks = list(), ...) {

  params <- check.params(params, ...)
  check.custom.obj()
  check.custom.eval()

  # data & watchlist checks
  dtrain <- data
  if (class(dtrain) != "xgb.DMatrix")
    stop("second argument dtrain must be xgb.DMatrix")
  if (length(watchlist) > 0) {
    if (typeof(watchlist) != "list" ||
        !all(sapply(watchlist, class) == "xgb.DMatrix"))
      stop("watchlist must be a list of xgb.DMatrix elements")
    evnames <- names(watchlist)
    if (is.null(evnames) || any(evnames == ""))
      stop("each element of the watchlist must have a name tag")
  }

  # evaluation printing callback
  params <- c(params, list(silent = ifelse(verbose > 1, 0, 1)))
  print.every.n <- max( as.integer(print.every.n), 1L)
  if (!has.callbacks(callbacks, 'cb.print_evaluation') && verbose)
    callbacks <- c(callbacks, cb.print_evaluation(print.every.n))

  # evaluation log callback: it is automatically enabled only when verbose > 0
  evaluation_log <- list()
  if (verbose > 0 &&
      !has.callbacks(callbacks, 'cb.log_evaluation') &&
      length(watchlist) > 0)
    callbacks <- c(callbacks, cb.log_evaluation())

  # Model saving callback
bestScore <- score if (!is.null(save_period) &&
bestInd <- i !has.callbacks(callbacks, 'cb.save_model'))
} else { callbacks <- c(callbacks, cb.save_model(save_period, save_name))
earlyStopflag = TRUE
if (i - bestInd >= early.stop.round) { # Early stopping callback
cat('Stopping. Best iteration:', bestInd, '\n') stop_condition <- FALSE
break if (!is.null(early.stop.round) &&
} !has.callbacks(callbacks, 'cb.early_stop'))
} callbacks <- c(callbacks,
} cb.early_stop(early.stop.round, maximize=maximize, verbose=verbose))
}
if (save_period > 0) { # Sort the callbacks into categories
if (i %% save_period == 0) { names(callbacks) <- callback.names(callbacks)
xgb.save(bst, save_name) cb <- categorize.callbacks(callbacks)
}
}
# Construct a booster (either a new one or load from xgb_model)
handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
bst <- xgb.handleToBooster(handle)
# When the 'xgb_model' was set, find out how many boosting rounds it has
# by adjusting its number of trees for num_parallel_tree and multiclass
ntree <- 0
if (!is.null(xgb_model)) {
ntree <- if ('ntree' %in% names(xgb_model)) xgb_model$ntree
else length(grep('^booster', xgb.dump(bst)))
} }
num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
nboost <- ntree %/% (num_parallel_tree * num_class)
# TODO: distributed code
rank <- 0
begin_iteration <- nboost + 1
end_iteration <- nboost + nrounds
# the main loop for boosting iterations
for (iteration in begin_iteration:end_iteration) {
for (f in cb$pre_iter) f()
xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
bst_evaluation <- numeric(0)
if (length(watchlist) > 0)
bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
for (f in cb$post_iter) f()
if (stop_condition) break
}
for (f in cb$finalize) f(finalize=TRUE)
bst <- xgb.Booster.check(bst) bst <- xgb.Booster.check(bst)
if (!is.null(early.stop.round)) { # store the total number of boosting iterations and the number of trees
bst$bestScore <- bestScore bst$nboost = end_iteration
bst$bestInd <- bestInd bst$ntree = end_iteration * num_parallel_tree * num_class
# store the evaluation results
if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) {
# include the previous compatible history when available
if (class(xgb_model) == 'xgb.Booster' &&
!is.null(xgb_model$evaluation_log) &&
all(colnames(evaluation_log) == colnames(xgb_model$evaluation_log)))
evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
bst$evaluation_log <- evaluation_log
} }
attr(bst, "call") <- fit.call bst$call <- match.call()
attr(bst, "params") <- params bst$params <- params
bst$callbacks <- callbacks
return(bst) return(bst)
} }
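The helpers referenced above (has.callbacks, callback.names, categorize.callbacks) are not shown in this hunk. The following is only a minimal sketch of what they are assumed to do, based on the callback attributes ('name', 'is_pre_iteration', a 'finalize' argument); the package's actual implementations may differ.

    callback.names <- function(cb_list) {
      # each callback closure records its constructor name in a 'name' attribute
      sapply(cb_list, function(x) attr(x, 'name'))
    }

    has.callbacks <- function(cb_list, query) {
      # TRUE when a callback created by `query` is already present in the list
      length(cb_list) > 0 && any(callback.names(cb_list) == query)
    }

    categorize.callbacks <- function(cb_list) {
      # split callbacks into pre-iteration, post-iteration, and finalizer groups
      list(
        pre_iter  = Filter(function(x) isTRUE(attr(x, 'is_pre_iteration')), cb_list),
        post_iter = Filter(function(x) !isTRUE(attr(x, 'is_pre_iteration')), cb_list),
        finalize  = Filter(function(x) 'finalize' %in% names(formals(x)), cb_list)
      )
    }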

View File

@ -1,83 +1,28 @@
# Simple interface for training an xgboost model.
# Its documentation is combined with xgb.train.
#
#' @rdname xgb.train
#' @export
xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                    params = list(), nrounds,
                    verbose = 1, print.every.n = 1L,
                    early.stop.round = NULL, maximize = NULL,
                    save_period = 0, save_name = "xgboost.model",
                    xgb_model = NULL, callbacks = list(), ...) {

  dtrain <- xgb.get.DMatrix(data, label, missing, weight)

  watchlist <- list()
  if (verbose > 0)
    watchlist$train <- dtrain

  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n = print.every.n,
                   early.stop.round = early.stop.round, maximize = maximize,
                   save_period = save_period, save_name = save_name,
                   xgb_model = xgb_model, callbacks = callbacks, ...)
  return(bst)
}
#' Training part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
@ -131,3 +76,18 @@ NULL
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 126 variables
NULL
# Various imports
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#' @import methods
#' @useDynLib xgboost
NULL