From 76650c096f0d6b97c2306b518f81e9c1ee6856fb Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich
Date: Mon, 27 Jun 2016 01:49:47 -0500
Subject: [PATCH] [R] CB naming change; cv-prediction as CB; add.cb function to ensure proper CB order; docs; minor fixes + changes

---
 R-package/R/callbacks.R | 229 +++++++++++++++++++++++++++++-----------
 R-package/R/xgb.cv.R    | 174 ++++++++++++++++--------------
 R-package/R/xgb.train.R | 191 ++++++++++++++++++++-------------
 3 files changed, 376 insertions(+), 218 deletions(-)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 95cc372eb..cffcec31f 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -15,18 +15,19 @@
 #' the environment from which they are called, which is a fairly uncommon thing to do in R.
 #'
 #' To write a custom callback closure, make sure you first understand the main concepts about R environments.
-#' Check either the R docs on \code{\link[base]{environment}} or the
-#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from Hadley Wickham's "Advanced R" book.
-#' Then take a look at the code of \code{cb.reset_learning_rate} for a simple example,
-#' and see the \code{cb.log_evaluation} code for something more involved.
-#' Also, you would need to get familiar with the objects available inside of the \code{xgb.train} internal environment.
+#' Check either the R documentation on \code{\link[base]{environment}} or the
+#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R"
+#' book by Hadley Wickham. Beyond that, the best option is to read the code of some of the existing callbacks:
+#' choose ones that do something similar to what you want to achieve. Also, you would need to get familiar
+#' with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments.
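+#'
+#' As an illustration, a minimal custom callback closure could look like the sketch below.
+#' The name \code{cb.track.best} is hypothetical, and the sketch assumes only the
+#' \code{bst_evaluation} vector that the training functions expose in their internal environment:
+#'
+#' \preformatted{
+#' cb.track.best <- function() {
+#'   best <- Inf  # state kept in the closure's own environment
+#'   callback <- function(env = parent.frame()) {
+#'     score <- env$bst_evaluation[1]
+#'     if (!is.na(score) && score < best) best <<- score  # assumes smaller is better
+#'   }
+#'   attr(callback, 'call') <- match.call()
+#'   attr(callback, 'name') <- 'cb.track.best'  # a non-empty 'name' attribute is expected
+#'   callback
+#' }
+#' }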
 #'
 #' @seealso
-#' \code{\link{cb.print_evaluation}},
-#' \code{\link{cb.log_evaluation}},
-#' \code{\link{cb.reset_parameters}},
-#' \code{\link{cb.early_stop}},
-#' \code{\link{cb.save_model}},
+#' \code{\link{cb.print.evaluation}},
+#' \code{\link{cb.evaluation.log}},
+#' \code{\link{cb.reset.parameters}},
+#' \code{\link{cb.early.stop}},
+#' \code{\link{cb.save.model}},
+#' \code{\link{cb.cv.predict}},
 #' \code{\link{xgb.train}},
 #' \code{\link{xgb.cv}}
 #'
@@ -55,7 +56,7 @@ NULL
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.print_evaluation <- function(period=1) {
+cb.print.evaluation <- function(period=1) {
 
   callback <- function(env = parent.frame()) {
     if (length(env$bst_evaluation) == 0 ||
@@ -67,12 +68,12 @@ cb.print_evaluation <- function(period=1) {
     if ((i-1) %% period == 0 ||
         i == env$begin_iteration || i == env$end_iteration) {
-      msg <- format_eval_string(i, env$bst_evaluation, env$bst_evaluation_err)
+      msg <- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
       cat(msg, '\n')
     }
   }
   attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.print_evaluation'
+  attr(callback, 'name') <- 'cb.print.evaluation'
   callback
 }
@@ -100,7 +101,7 @@ cb.print_evaluation <- function(period=1) {
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.log_evaluation <- function() {
+cb.evaluation.log <- function() {
 
   mnames <- NULL
 
@@ -147,7 +148,7 @@ cb.log_evaluation <- function() {
                               list(c(iter = env$iteration, ev)))
   }
   attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.log_evaluation'
+  attr(callback, 'name') <- 'cb.evaluation.log'
   callback
 }
@@ -178,17 +179,27 @@ cb.log_evaluation <- function() {
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.reset_parameters <- function(new_params) {
+cb.reset.parameters <- function(new_params) {
 
   if (typeof(new_params) != "list")
     stop("'new_params' must be a list")
   pnames <- gsub("\\.", "_", names(new_params))
-  # TODO: restrict the set of parameters that could be reset?
   nrounds <- NULL
 
   # run some checks in the beginning
   init <- function(env) {
     nrounds <<- env$end_iteration - env$begin_iteration + 1
+
+    if (is.null(env$bst) && is.null(env$bst_folds))
+      stop("Parent frame has neither 'bst' nor 'bst_folds'")
+
+    # Some parameters are not allowed to be changed,
+    # since changing them would simply wreak havoc
+    not_allowed <- pnames %in%
+      c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq')
+    if (any(not_allowed))
+      stop('Parameters ', paste(pnames[not_allowed], collapse = ', '), " cannot be changed during boosting.")
+
     for (n in pnames) {
       p <- new_params[[n]]
       if (is.function(p)) {
@@ -223,7 +234,7 @@ cb.reset_parameters <- function(new_params) {
   }
   attr(callback, 'is_pre_iteration') <- TRUE
   attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.reset_parameters'
+  attr(callback, 'name') <- 'cb.reset.parameters'
   callback
 }
@@ -246,15 +257,15 @@ cb.reset_parameters <- function(new_params) {
 #' This callback function determines the condition for early stopping
 #' by setting the \code{stop_condition = TRUE} flag in its calling frame.
 #'
-#' The following additional fields are assigned to the model R object:
+#' The following additional fields are assigned to the model's R object:
 #' \itemize{
 #' \item \code{best_score} the evaluation score at the best iteration
 #' \item \code{best_iteration} the boosting iteration at which the best score occurred (1-based index)
 #' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
-#' It differs from \code{best_iteration} in multiclass or random forest settings.
+#'   It differs from \code{best_iteration} in multiclass or random forest settings.
 #' }
 #'
-#' The Same values are also stored as xgb-attributes, however:
+#' The same values are also stored as xgb-attributes:
 #' \itemize{
 #' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
 #' \item \code{best_msg} message string is also stored.
@@ -266,22 +277,22 @@ cb.reset_parameters <- function(new_params) {
 #' \code{stop_condition},
 #' \code{bst_evaluation},
 #' \code{rank},
-#' \code{bst} or \code{bst_folds},
+#' \code{bst} (or \code{bst_folds} and \code{basket}),
 #' \code{iteration},
 #' \code{begin_iteration},
 #' \code{end_iteration},
-#' \code{num_parallel_tree},
-#' \code{num_class}.
+#' \code{num_parallel_tree}.
 #'
 #' @seealso
 #' \code{\link{callbacks}},
 #' \code{\link{xgb.attr}}
 #'
 #' @export
-cb.early_stop <- function(stopping_rounds, maximize=FALSE,
+cb.early.stop <- function(stopping_rounds, maximize=FALSE,
                           metric_name=NULL, verbose=TRUE) {
   # state variables
   best_iteration <- -1
+  best_ntreelimit <- -1
   best_score <- Inf
   best_msg <- NULL
   metric_idx <- 1
@@ -331,24 +342,23 @@ cb.early_stop <- function(stopping_rounds, maximize=FALSE,
         xgb.attributes(env$bst$handle) <- list(best_iteration = best_iteration - 1,
                                                best_score = best_score)
       }
-    } else if (is.null(env$bst_folds)) {
-      stop("Parent frame has neither 'bst' nor 'bst_folds'")
+    } else if (is.null(env$bst_folds) || is.null(env$basket)) {
+      stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')")
     }
   }
 
   finalizer <- function(env) {
-    best_ntreelimit = best_iteration * env$num_parallel_tree * env$num_class
     if (!is.null(env$bst)) {
       attr_best_score = as.numeric(xgb.attr(env$bst$handle, 'best_score'))
       if (best_score != attr_best_score)
-        stop("Inconsistent 'best_score' between the state: ", best_score,
+        stop("Inconsistent 'best_score' values between the closure state: ", best_score,
             " and the xgb.attr: ", attr_best_score)
-      env$bst$best_score = best_score
       env$bst$best_iteration = best_iteration
       env$bst$best_ntreelimit = best_ntreelimit
+      env$bst$best_score = best_score
     } else {
-      attr(env$bst_folds, 'best_iteration') <- best_iteration
-      attr(env$bst_folds, 'best_ntreelimit') <- best_ntreelimit
+      env$basket$best_iteration <- best_iteration
+      env$basket$best_ntreelimit <- best_ntreelimit
     }
   }
 
@@ -365,16 +375,17 @@ cb.early_stop <- function(stopping_rounds, maximize=FALSE,
 
     if (( maximize && score > best_score) ||
         (!maximize && score < best_score)) {
-      best_msg <<- format_eval_string(i, env$bst_evaluation, env$bst_evaluation_err)
+      best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
       best_score <<- score
       best_iteration <<- i
+      best_ntreelimit <<- best_iteration * env$num_parallel_tree
      # save the properties to attributes, so they will be present in a checkpoint
      if (!is.null(env$bst)) {
        xgb.attributes(env$bst) <- list(
          best_iteration = best_iteration - 1, # convert to 0-based index
          best_score = best_score,
          best_msg = best_msg,
-          best_ntreelimit = best_iteration * env$num_parallel_tree * env$num_class)
+          best_ntreelimit = best_ntreelimit)
      }
    } else if (i - best_iteration >= stopping_rounds) {
      env$stop_condition <- TRUE
@@ -384,7 +395,7 @@ cb.early_stop <- function(stopping_rounds, maximize=FALSE,
    }
  }
  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.early_stop'
+  attr(callback, 'name') <- 'cb.early.stop'
  callback
}
@@ -412,7 +423,7 @@ cb.early_stop <- function(stopping_rounds, maximize=FALSE,
 #' \code{\link{callbacks}}
 #'
 #' @export
-cb.save_model <- function(save_period = 0, save_name = "xgboost.model") {
+cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
 
   if (save_period < 0)
     stop("'save_period' cannot be negative")
@@ -426,7 +437,80 @@ cb.save_model <- function(save_period = 0, save_name = "xgboost.model") {
       xgb.save(env$bst, sprintf(save_name, env$iteration))
   }
   attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.save_model'
+  attr(callback, 'name') <- 'cb.save.model'
+  callback
+}
+
+
+#' Callback closure for returning cross-validation based predictions.
+#'
+#' @param save_models a flag for whether to save the folds' models.
+#'
+#' @details
+#' This callback function saves predictions for all of the test folds,
+#' and also allows saving the folds' models.
+#'
+#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
+#' thus it must be run after the early stopping callback if early stopping is used.
+#'
+#' Callback function expects the following values to be set in its calling frame:
+#' \code{bst_folds},
+#' \code{basket},
+#' \code{data},
+#' \code{end_iteration},
+#' \code{num_parallel_tree},
+#' \code{num_class}.
+#'
+#' @return
+#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix,
+#' depending on the number of prediction outputs per data row. The order of predictions corresponds
+#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is
+#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a
+#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be
+#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits.
+#' When some of the indices in the training dataset are not included in the user-provided \code{folds},
+#' their prediction value would be \code{NA}.
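+#'
+#' As a usage sketch (this callback is normally engaged automatically by passing
+#' \code{prediction = TRUE} to \code{xgb.cv}; \code{dtrain} is assumed to be an
+#' \code{xgb.DMatrix} as in the package examples):
+#'
+#' \preformatted{
+#' cv <- xgb.cv(data = dtrain, nrounds = 3, nfold = 5,
+#'              objective = "binary:logistic",
+#'              callbacks = list(cb.cv.predict(save_models = TRUE)))
+#' str(cv$pred)       # out-of-fold predictions, in the original row order
+#' length(cv$models)  # one model per fold when save_models = TRUE
+#' }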
+#'
+#' @seealso
+#' \code{\link{callbacks}}
+#'
+#' @export
+cb.cv.predict <- function(save_models = FALSE) {
+
+  finalizer <- function(env) {
+    if (is.null(env$basket) || is.null(env$bst_folds))
+      stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
+
+    N <- nrow(env$data)
+    pred <- if (env$num_class > 1)
+              matrix(NA_real_, N, env$num_class)
+            else
+              rep(NA_real_, N)
+
+    ntreelimit <- NVL(env$basket$best_ntreelimit,
+                      env$end_iteration * env$num_parallel_tree)
+    for (fd in env$bst_folds) {
+      pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
+      if (is.matrix(pred)) {
+        pred[fd$index,] <- pr
+      } else {
+        pred[fd$index] <- pr
+      }
+    }
+    env$basket$pred <- pred
+    if (save_models) {
+      env$basket$models <- lapply(env$bst_folds, function(fd) {
+        xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
+        xgb.Booster.check(xgb.handleToBooster(fd$bst), saveraw = TRUE)
+      })
+    }
+  }
+
+  callback <- function(env = parent.frame(), finalize = FALSE) {
+    if (finalize)
+      return(finalizer(env))
+  }
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'name') <- 'cb.cv.predict'
   callback
 }
@@ -436,7 +520,7 @@ cb.save_model <- function(save_period = 0, save_name = "xgboost.model") {
 #
 # Format the evaluation metric string
-format_eval_string <- function(iter, eval_res, eval_err=NULL) {
+format.eval.string <- function(iter, eval_res, eval_err=NULL) {
   if (length(eval_res) == 0)
     stop('no evaluation results')
   enames <- names(eval_res)
@@ -454,47 +538,68 @@ format_eval_string <- function(iter, eval_res, eval_err=NULL) {
 }
 
 # Extract callback names from the list of callbacks
-callback.names <- function(cb.list) {
-  unlist(lapply(cb.list, function(x) attr(x, 'name')))
+callback.names <- function(cb_list) {
+  unlist(lapply(cb_list, function(x) attr(x, 'name')))
 }
 
 # Extract callback calls from the list of callbacks
-callback.calls <- function(cb.list) {
-  unlist(lapply(cb.list, function(x) attr(x, 'call')))
+callback.calls <- function(cb_list) {
+  unlist(lapply(cb_list, function(x) attr(x, 'call')))
+}
+
+# Add a callback cb to the list and make sure that
+# cb.early.stop and cb.cv.predict are at the end of the list
+# with cb.cv.predict being the last (when present)
+add.cb <- function(cb_list, cb) {
+  cb_list <- c(cb_list, cb)
+  names(cb_list) <- callback.names(cb_list)
+  if ('cb.early.stop' %in% names(cb_list)) {
+    cb_list <- c(cb_list, cb_list['cb.early.stop'])
+    # this removes only the first one
+    cb_list['cb.early.stop'] <- NULL
+  }
+  if ('cb.cv.predict' %in% names(cb_list)) {
+    cb_list <- c(cb_list, cb_list['cb.cv.predict'])
+    cb_list['cb.cv.predict'] <- NULL
+  }
+  cb_list
 }
 
 # Sort callbacks list into categories
-categorize.callbacks <- function(cb.list) {
+categorize.callbacks <- function(cb_list) {
   list(
     pre_iter = Filter(function(x) {
         pre <- attr(x, 'is_pre_iteration')
         !is.null(pre) && pre
-      }, cb.list),
+      }, cb_list),
     post_iter = Filter(function(x) {
         pre <- attr(x, 'is_pre_iteration')
         is.null(pre) || !pre
-      }, cb.list),
     finalize = Filter(function(x) {
        'finalize' %in% names(formals(x))
-      }, cb.list)
+      }, cb_list)
  )
}
 
-# Check whether all callback functions with names given by 'query.names' are present in the 'cb.list'.
-has.callbacks <- function(cb.list, query.names) {
-  if (length(cb.list) < length(query.names))
+# Check whether all callback functions with names given by 'query_names' are present in the 'cb_list'.
+has.callbacks <- function(cb_list, query_names) {
+  if (length(cb_list) < length(query_names))
     return(FALSE)
-  if (!is.list(cb.list) ||
-      !all(sapply(cb.list, class) == 'function'))
-    stop('`cb.list` must be a list of callback functions')
-  cb.names <- callback.names(cb.list)
-  if (!is.character(cb.names) ||
-      length(cb.names) != length(cb.list) ||
-      any(cb.names == ""))
-    stop('All callbacks in the `cb.list` must have a non-empty `name` attribute')
-  if (!is.character(query.names) ||
-      length(query.names) == 0 ||
-      any(query.names == ""))
-    stop('query.names must be a non-empty vector of non-empty character names')
-  return(all(query.names %in% cb.names))
+  if (!is.list(cb_list) ||
+      any(sapply(cb_list, class) != 'function')) {
+    stop('`cb_list` must be a list of callback functions')
+  }
+  cb_names <- callback.names(cb_list)
+  if (!is.character(cb_names) ||
+      length(cb_names) != length(cb_list) ||
+      any(cb_names == "")) {
+    stop('All callbacks in the `cb_list` must have a non-empty `name` attribute')
+  }
+  if (!is.character(query_names) ||
+      length(query_names) == 0 ||
+      any(query_names == "")) {
+    stop('query_names must be a non-empty vector of non-empty character names')
+  }
+  return(all(query_names %in% cb_names))
 }
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index c7aeebf70..e3a84cec0 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -10,20 +10,22 @@
 #'   \item \code{binary:logistic} logistic regression for classification
 #' }
 #' \item \code{eta} step size of each boosting step
-#' \item \code{max.depth} maximum depth of the tree
+#' \item \code{max_depth} maximum depth of the tree
 #' \item \code{nthread} number of threads used in training; if not set, all threads are used
 #' }
 #'
-#' See \link{xgb.train} for further details.
+#' See \code{\link{xgb.train}} for further details.
 #' See also demo/ for walkthrough example in R.
 #' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
 #' @param nrounds the max number of iterations
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
-#' @param label option field, when data is \code{Matrix}
-#' @param missing Missing is only used when input is dense matrix, pick a float
-#'        value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
-#' @param prediction A logical value indicating whether to return the prediction vector.
-#' @param showsd \code{boolean}, whether show standard deviation of cross validation
+#' @param label vector of response values. Should be provided only when data is an R matrix.
+#' @param missing is only used when input is a dense matrix. By default it is set to NA, which means
+#'        that NA values should be considered as 'missing' by the algorithm.
+#'        Sometimes, 0 or other extreme value might be used to represent missing values.
+#' @param prediction A logical value indicating whether to return the test fold predictions
+#'        from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.
+#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
 #' @param metrics list of evaluation metrics to be used in cross validation,
 #'   when it is not specified, the evaluation metric is chosen according to objective function.
 #'   Possible options are:
@@ -35,34 +37,33 @@
 #' \item \code{merror} Exact matching error, used to evaluate multi-class classification
 #' }
 #' @param obj customized objective function. Returns gradient and second order
-#'   gradient with given prediction and dtrain.
+#'   gradient with given prediction and dtrain.
 #' @param feval customized evaluation function. Returns
-#'   \code{list(metric='metric-name', value='metric-value')} with given
-#'   prediction and dtrain.
+#'   \code{list(metric='metric-name', value='metric-value')} with given
+#'   prediction and dtrain.
-#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
-#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
-#'   If folds are supplied, the nfold and stratified parameters would be ignored.
+#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
+#'   by the values of outcome labels.
+#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
+#'   (each element must be a vector of test fold's indices). When folds are supplied,
+#'   the \code{nfold} and \code{stratified} parameters are ignored.
 #' @param verbose \code{boolean}, print the statistics during the process
-#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
-#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
+#'   Default is 1 which means all messages are printed. This parameter is passed to the
+#'   \code{\link{cb.print.evaluation}} callback.
+#' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
 #'   If set to an integer \code{k}, training with a validation set will stop if the performance
 #'   doesn't improve for \code{k} rounds.
-#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
-#'   \code{maximize=TRUE} means the larger the evaluation score the better.
-#'
+#'   Setting this parameter engages the \code{\link{cb.early.stop}} callback.
+#' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
+#'   then this parameter must be set as well.
+#'   When it is \code{TRUE}, it means the larger the evaluation score the better.
+#'   This parameter is passed to the \code{\link{cb.early.stop}} callback.
+#' @param callbacks a list of callback functions to perform various tasks during boosting.
+#'   See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+#'   parameters' values. User can provide either existing or their own callback methods in order
+#'   to customize the training process.
 #' @param ... other parameters to pass to \code{params}.
 #'
-#' @return
-#' TODO: update this...
-#'
-#' If \code{prediction = TRUE}, a list with the following elements is returned:
-#' \itemize{
-#'   \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
-#'   \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
-#' }
-#'
-#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
-#'
 #' @details
 #' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
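+#'
+#' For example (a sketch, assuming a training set with 100 rows), a standard
+#' non-overlapping 5-fold split in the format expected by the \code{folds} parameter
+#' could be constructed as:
+#'
+#' \preformatted{
+#' idx <- sample(100)                               # shuffled row indices
+#' folds <- split(idx, rep(1:5, length.out = 100))  # 5 disjoint sets of test indices
+#' }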
 #'
@@ -74,23 +75,50 @@
 #'
 #' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
 #'
+#' @return
+#' An object of class \code{xgb.cv.synchronous} with the following elements:
+#' \itemize{
+#'   \item \code{call} a function call.
+#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not
+#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
+#'   \item \code{callbacks} callback functions that were either automatically assigned or
+#'         explicitly passed.
+#'   \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
+#'         first column corresponding to iteration number and the rest corresponding to the
+#'         CV-based evaluation means and standard deviations for the training and test CV-sets.
+#'         It is created by the \code{\link{cb.evaluation.log}} callback.
+#'   \item \code{niter} number of boosting iterations.
+#'   \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds}
+#'         parameter or randomly generated.
+#'   \item \code{best_iteration} iteration number with the best evaluation metric value
+#'         (only available with early stopping).
+#'   \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
+#'         which could further be used in the \code{predict} method
+#'         (only available with early stopping).
+#'   \item \code{pred} CV prediction values available when \code{prediction} is set.
+#'         It is either a vector or a matrix (see \code{\link{cb.cv.predict}}).
+#'   \item \code{models} a list of the CV folds' models. It is only available with the explicit
+#'         setting of the \code{cb.cv.predict(save_models = TRUE)} callback.
+#' }
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-#' history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
-#'                   max.depth =3, eta = 1, objective = "binary:logistic")
-#' print(history)
+#' cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
+#'              max_depth = 3, eta = 1, objective = "binary:logistic")
+#' print(cv)
+#' print(cv, verbose=TRUE)
 #'
 #' @export
 xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
                    prediction = FALSE, showsd = TRUE, metrics=list(),
                    obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
-                   verbose = TRUE, print.every.n=1L,
-                   early.stop.round = NULL, maximize = NULL, callbacks = list(), ...) {
+                   verbose = TRUE, print_every_n=1L,
+                   early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {
 
-  #strategy <- match.arg(strategy)
+  check.deprecation(...)
 
-  params <- check.params(params, ...)
+  params <- check.booster.params(params, ...)
 
   # TODO: should we deprecate the redundant 'metrics' parameter?
   for (m in metrics)
     params <- c(params, list("eval_metric" = m))
@@ -124,23 +152,28 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
 
   # verbosity & evaluation printing callback:
   params <- c(params, list(silent = 1))
-  print.every.n <- max( as.integer(print.every.n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print_evaluation') && verbose)
-    callbacks <- c(callbacks, cb.print_evaluation(print.every.n))
-
+  print_every_n <- max( as.integer(print_every_n), 1L)
+  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
+    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
+  }
   # evaluation log callback: is always on in CV
   evaluation_log <- list()
-  if (!has.callbacks(callbacks, 'cb.log_evaluation'))
-    callbacks <- c(callbacks, cb.log_evaluation())
-
+  if (!has.callbacks(callbacks, 'cb.evaluation.log')) {
+    callbacks <- add.cb(callbacks, cb.evaluation.log())
+  }
   # Early stopping callback
   stop_condition <- FALSE
-  if (!is.null(early.stop.round) &&
-      !has.callbacks(callbacks, 'cb.early_stop'))
-    callbacks <- c(callbacks, cb.early_stop(early.stop.round, maximize=maximize, verbose=verbose))
-
+  if (!is.null(early_stopping_rounds) &&
+      !has.callbacks(callbacks, 'cb.early.stop')) {
+    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
+                                                 maximize=maximize, verbose=verbose))
+  }
+  # CV-predictions callback
+  if (prediction &&
+      !has.callbacks(callbacks, 'cb.cv.predict')) {
+    callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
+  }
   # Sort the callbacks into categories
-  names(callbacks) <- callback.names(callbacks)
   cb <- categorize.callbacks(callbacks)
@@ -152,10 +185,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     bst <- xgb.Booster(params, list(dtrain, dtest))
     list(dtrain=dtrain, bst=bst, watchlist=list(train=dtrain, test=dtest), index=folds[[k]])
   })
+  # a "basket" to collect some results from callbacks
+  basket <- list()
 
   # extract parameters that can affect the relationship b/w #trees and #iterations
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
   num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
+
   # those are fixed for CV (no training continuation)
   begin_iteration <- 1
   end_iteration <- nrounds
@@ -171,7 +208,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
       msg <- simplify2array(msg)
       bst_evaluation <- rowMeans(msg)
       bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2)
-
+
       for (f in cb$post_iter) f()
 
       if (stop_condition) break
@@ -184,35 +221,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     params = params,
     callbacks = callbacks,
     evaluation_log = evaluation_log,
-    nboost = end_iteration,
-    ntree = end_iteration * num_parallel_tree * num_class
-  )
-  if (!is.null(attr(bst_folds, 'best_iteration'))) {
-    ret$best_iteration <- attr(bst_folds, 'best_iteration')
-    ret$best_ntreelimit <- attr(bst_folds, 'best_ntreelimit')
-  }
-  ret$folds <- folds
+    niter = end_iteration,
+    folds = folds
+  )
+  ret <- c(ret, basket)
 
-  # TODO: should making prediction go
-  #   a. into a callback?
-  #   b. return folds' models, and have a separate method for predictions?
-  if (prediction) {
-    ret$pred <- ifelse(num_class > 1,
-                       matrix(0, nrow(data), num_class),
-                       rep(0, nrow(data)))
-    ntreelimit <- NVL(ret$best_ntreelimit, ret$ntree)
-    for (fd in bst_folds) {
-      pred <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit)
-      if (is.matrix(ret$pred))
-        ret$pred[fd$index,] <- t(matrix(pred, num_class, length(fd$index)))
-      else
-        ret$pred[fd$index] <- pred
-    }
-    ret$bst <- lapply(bst_folds, function(x) {
-      xgb.Booster.check(xgb.handleToBooster(x$bst), saveraw = TRUE)
-    })
-  }
-
   class(ret) <- 'xgb.cv.synchronous'
   invisible(ret)
 }
@@ -234,8 +247,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
-#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max.depth = 2,
-#'              eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
+#'              eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' print(cv)
 #' print(cv, verbose=TRUE)
 #'
@@ -264,7 +277,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
     })
   }
 
-  for (n in c('nboost', 'ntree', 'best_iteration', 'best_ntreelimit')) {
+  for (n in c('niter', 'best_iteration', 'best_ntreelimit')) {
     if (is.null(x[[n]]))
       next
     cat(n, ': ', x[[n]], '\n', sep='')
@@ -279,6 +292,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
   if (verbose)
     cat('evaluation_log:\n')
   print(x$evaluation_log, row.names = FALSE, ...)
+
   if (!is.null(x$best_iteration)) {
     cat('Best iteration:\n')
     print(x$evaluation_log[x$best_iteration], row.names = FALSE, ...)
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index c2fbea53e..022e02246 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -45,8 +45,8 @@
 #' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
 #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
 #' \item \code{num_class} set the number of classes. To use only with multiclass objectives.
-#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}.
-#' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
+#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}.
+#' \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to an ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
 #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
 #' }
 #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
@@ -63,34 +63,41 @@
 #'        the performance of each round's model on mat1 and mat2
 #'
 #' @param obj customized objective function. Returns gradient and second order
-#'        gradient with given prediction and dtrain,
+#'        gradient with given prediction and dtrain.
 #' @param feval customized evaluation function. Returns
 #'        \code{list(metric='metric-name', value='metric-value')} with given
-#'        prediction and dtrain,
+#'        prediction and dtrain.
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
-#'        information of performance. If 2, xgboost will print information of both
-#' @param print.every.n Print every N progress messages when \code{verbose>0}.
-#'        Default is 1 which means all messages are printed.
-#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
+#'        information of performance. If 2, xgboost will print some additional information.
+#'        Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and
+#'        \code{\link{cb.print.evaluation}} callback functions.
+#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
+#'        Default is 1 which means all messages are printed. This parameter is passed to the
+#'        \code{\link{cb.print.evaluation}} callback.
+#' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered.
 #'        If set to an integer \code{k}, training with a validation set will stop if the performance
-#'        keeps getting worse consecutively for \code{k} rounds.
-#' @param maximize If \code{feval} and \code{early.stop.round} are set,
-#'        then \code{maximize} must be set as well.
-#'        \code{maximize=TRUE} means the larger the evaluation score the better.
-#' @param save_period save the model to the disk after every \code{save_period} rounds, 0 means save at the end.
-#' @param save_name the name or path for periodically saved model file.
-#' @param xgb_model the previously built model to continue the trainig from.
+#'        doesn't improve for \code{k} rounds.
+#'        Setting this parameter engages the \code{\link{cb.early.stop}} callback.
+#' @param maximize If \code{feval} and \code{early_stopping_rounds} are set,
+#'        then this parameter must be set as well.
+#'        When it is \code{TRUE}, it means the larger the evaluation score the better.
+#'        This parameter is passed to the \code{\link{cb.early.stop}} callback.
+#' @param save_period when it is non-NULL, the model is saved to disk after every \code{save_period} rounds;
+#'        0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.
+#' @param save_name the name or path for the periodically saved model file.
+#' @param xgb_model a previously built model to continue the training from.
 #'        Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a
 #'        file with a previously saved model.
-#' @param callbacks a list of callback functions to perform various task during boosting.
-#'        See \code{\link{callbacks}}. Some of the callbacks are currently automatically
-#'        created when specific parameters are set.
+#' @param callbacks a list of callback functions to perform various tasks during boosting.
+#'        See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the
+#'        parameters' values. User can provide either existing or their own callback methods in order
+#'        to customize the training process.
 #' @param ... other parameters to pass to \code{params}.
-#' @param label the response variable. User should not set this field,
-#'        if data is local data file or \code{xgb.DMatrix}.
+#' @param label vector of response values. Should not be provided when data is
+#'        a local data file name or an \code{xgb.DMatrix}.
 #' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
 #'        by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-#'        This parameter is only used when input is dense matrix,
+#'        This parameter is only used when input is a dense matrix.
 #' @param weight a vector indicating the weight for each row of the input.
 #'
 #' @details
@@ -112,25 +119,50 @@
 #'   \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 #'   \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
 #'   \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
-#'   \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
+#'   \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 #'         By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
 #'         Different threshold (e.g., 0.) could be specified as "error@0."
-#'   \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
+#'   \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 #'   \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
 #'   \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
 #' }
 #'
 #' The following callbacks are automatically created when certain parameters are set:
 #' \itemize{
-#'   \item \code{cb.print_evaluation} is turned on when \code{verbose > 0};
-#'         and the \code{print.every.n} parameter is passed to it.
-#'   \item \code{cb.log_evaluation} is on when \code{verbose > 0} and \code{watchlist} is present.
-#'   \item \code{cb.early_stop}: when \code{early.stop.round} is set.
-#'   \item \code{cb.save_model}: when \code{save_period > 0} is set.
+#'   \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
+#'         and the \code{print_every_n} parameter is passed to it.
+#'   \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present.
+#'   \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
+#'   \item \code{cb.save.model}: when \code{save_period > 0} is set.
 #' }
 #'
 #' @return
-#' TODO
+#' An object of class \code{xgb.Booster} with the following elements:
+#' \itemize{
+#'   \item \code{handle} a handle (pointer) to the xgboost model in memory.
+#'   \item \code{raw} a cached memory dump of the xgboost model saved as R's \code{raw} type.
+#'   \item \code{niter} number of boosting iterations.
+#'   \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the
+#'         first column corresponding to iteration number and the rest corresponding to evaluation
+#'         metrics' values. It is created by the \code{\link{cb.evaluation.log}} callback.
+#'   \item \code{call} a function call.
+#'   \item \code{params} parameters that were passed to the xgboost library. Note that it does not
+#'         capture parameters changed by the \code{\link{cb.reset.parameters}} callback.
+#'   \item \code{callbacks} callback functions that were either automatically assigned or
+#'         explicitly passed.
+#'   \item \code{best_iteration} iteration number with the best evaluation metric value
+#'         (only available with early stopping).
+#'   \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
+#'         which could further be used in the \code{predict} method
+#'         (only available with early stopping).
+#'   \item \code{best_score} the best evaluation metric value during early stopping
+#'         (only available with early stopping).
+#' }
+#'
+#' @seealso
+#' \code{\link{callbacks}},
+#' \code{\link{predict.xgb.Booster}},
+#' \code{\link{xgb.cv}}
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -141,8 +173,9 @@
 #' watchlist <- list(eval = dtest, train = dtrain)
 #'
 #' ## A simple xgb.train example:
-#' param <- list(max.depth = 2, eta = 1, silent = 1, objective="binary:logistic", eval_metric="auc")
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+#' param <- list(max_depth = 2, eta = 1, silent = 1,
+#'               objective = "binary:logistic", eval_metric = "auc")
+#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
 #'
 #' ## An xgb.train example where custom objective and evaluation metric are used:
 #' logregobj <- function(preds, dtrain) {
@@ -157,33 +190,36 @@
 #'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'   return(list(metric = "error", value = err))
 #' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
 #'
 #' ## An xgb.train example of using variable learning rates at each iteration:
 #' my_etas <- list(eta = c(0.5, 0.1))
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
-#'                  callbacks = list(cb.reset_parameters(my_etas)))
+#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+#'                  callbacks = list(cb.reset.parameters(my_etas)))
 #'
-#' ## Explicit use of the cb.log_evaluation callback allows to run
+#' ## Explicit use of the cb.evaluation.log callback allows running
 #' ## xgb.train silently but still storing the evaluation results:
-#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
-#'                  verbose = 0, callbacks = list(cb.log_evaluation()))
+#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+#'                  verbose = 0, callbacks = list(cb.evaluation.log()))
 #' print(bst$evaluation_log)
 #'
 #' ## An 'xgboost' interface example:
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
-#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
+#'                max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
+#'                objective = "binary:logistic")
 #' pred <- predict(bst, agaricus.test$data)
 #'
 #' @rdname xgb.train
 #' @export
 xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
-                      obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
-                      early.stop.round = NULL, maximize = NULL,
+                      obj = NULL, feval = NULL, verbose = 1, print_every_n=1L,
+                      early_stopping_rounds = NULL, maximize = NULL,
                       save_period = NULL, save_name = "xgboost.model",
                       xgb_model = NULL, callbacks = list(), ...) {
 
-  params <- check.params(params, ...)
+  check.deprecation(...)
+
+  params <- check.booster.params(params, ...)
 
   check.custom.obj()
   check.custom.eval()
@@ -203,31 +239,30 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
 
   # evaluation printing callback
   params <- c(params, list(silent = ifelse(verbose > 1, 0, 1)))
-  print.every.n <- max( as.integer(print.every.n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print_evaluation') && verbose)
-    callbacks <- c(callbacks, cb.print_evaluation(print.every.n))
-
+  print_every_n <- max( as.integer(print_every_n), 1L)
+  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
+    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
+  }
   # evaluation log callback: it is automatically enabled only when verbose > 0
   evaluation_log <- list()
   if (verbose > 0 &&
-      !has.callbacks(callbacks, 'cb.log_evaluation') &&
-      length(watchlist) > 0)
-    callbacks <- c(callbacks, cb.log_evaluation())
-
+      !has.callbacks(callbacks, 'cb.evaluation.log') &&
+      length(watchlist) > 0) {
+    callbacks <- add.cb(callbacks, cb.evaluation.log())
+  }
   # Model saving callback
   if (!is.null(save_period) &&
-      !has.callbacks(callbacks, 'cb.save_model'))
-    callbacks <- c(callbacks, cb.save_model(save_period, save_name))
-
+      !has.callbacks(callbacks, 'cb.save.model')) {
+    callbacks <- add.cb(callbacks, cb.save.model(save_period, save_name))
+  }
   # Early stopping callback
   stop_condition <- FALSE
-  if (!is.null(early.stop.round) &&
-      !has.callbacks(callbacks, 'cb.early_stop'))
-    callbacks <- c(callbacks,
-                   cb.early_stop(early.stop.round, maximize=maximize, verbose=verbose))
-
+  if (!is.null(early_stopping_rounds) &&
+      !has.callbacks(callbacks, 'cb.early.stop')) {
+    callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
+                                                 maximize=maximize, verbose=verbose))
+  }
   # Sort the callbacks into categories
-  names(callbacks) <- callback.names(callbacks)
   cb <- categorize.callbacks(callbacks)
@@ -235,23 +270,24 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
 
   handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
   bst <- xgb.handleToBooster(handle)
-
-  # When the 'xgb_model' was set, find out how many boosting rounds it has
-  # by adjusting its number of trees for num_parallel_tree and multiclass
-  ntree <- 0
-  if (!is.null(xgb_model)) {
-    ntree <- if ('ntree' %in% names(xgb_model)) xgb_model$ntree
-             else length(grep('^booster', xgb.dump(bst)))
-  }
+
   # extract parameters that can affect the relationship b/w #trees and #iterations
   num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
   num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
-  nboost <- ntree %/% (num_parallel_tree * num_class)
+
+  # When the 'xgb_model' was set, find out how many boosting iterations it has
+  niter_skip <- 0
+  if (!is.null(xgb_model)) {
+    niter_skip <- as.numeric(xgb.attr(bst, 'niter')) + 1
+    if (length(niter_skip) == 0) {
+      niter_skip <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
+    }
+  }
 
   # TODO: distributed code
   rank <- 0
 
-  begin_iteration <- nboost + 1
-  end_iteration <- nboost + nrounds
+  begin_iteration <- niter_skip + 1
+  end_iteration <- niter_skip + nrounds
 
   # the main loop for boosting iterations
   for (iteration in begin_iteration:end_iteration) {
@@ -263,6 +299,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
     bst_evaluation <- numeric(0)
     if (length(watchlist) > 0)
       bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
+
+    xgb.attr(bst$handle, 'niter') <- iteration - 1
 
     for (f in cb$post_iter) f()
 
@@ -270,20 +308,21 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   }
 
   for (f in cb$finalize) f(finalize=TRUE)
-  bst <- xgb.Booster.check(bst)
-
-  # store the total number of boosting iterations and the number of trees
-  bst$nboost = end_iteration
-  bst$ntree = end_iteration * num_parallel_tree * num_class
+  bst <- xgb.Booster.check(bst, saveraw = TRUE)
 
+  # store the total number of boosting iterations
+  bst$niter = end_iteration
+
+  # store the evaluation results
   if (length(evaluation_log) > 0 &&
       nrow(evaluation_log) > 0) {
     # include the previous compatible history when available
     if (class(xgb_model) == 'xgb.Booster' &&
         !is.null(xgb_model$evaluation_log) &&
-        all(colnames(evaluation_log) == colnames(xgb_model$evaluation_log)))
+        isTRUE(all.equal(colnames(evaluation_log),
+                         colnames(xgb_model$evaluation_log)))) {
       evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
+    }
     bst$evaluation_log <- evaluation_log
   }
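A quick sketch of the training-continuation behavior implemented above (reusing the
param, dtrain and watchlist objects from the xgb.train examples; the comments state
expected behavior under those assumptions, not guaranteed output):

# Continue boosting from an existing model: the 'niter' attribute saved during
# the first call lets the second call resume the iteration numbering, and a
# compatible evaluation_log history is prepended to the new one.
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1)
bst2$niter                  # 4 = 2 + 2 boosting iterations in total
print(bst2$evaluation_log)  # includes bst1's evaluation history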