diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index e5f5c7b72..2dee9d9f8 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -54,6 +54,13 @@ #' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). #' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process +#' @param early_stop_round If \code{NULL}, the early stopping function is not triggered. +#' If set to an integer \code{k}, training with a validation set will stop if the performance +#' keeps getting worse consecutively for \code{k} rounds. +#' @param early.stop.round An alternative of \code{early_stop_round}. +#' @param maximize If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. +#' \code{maximize=TRUE} means the larger the evaluation score the better. +#' #' @param ... other parameters to pass to \code{params}. #' #' @return @@ -86,7 +93,8 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, + early_stop_round = NULL, early.stop.round = NULL, maximize = NULL, ...) 
{ if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -109,7 +117,36 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (mc in metrics) { params <- append(params, list("eval_metric"=mc)) } - + + # Early Stopping + if (is.null(early_stop_round) && !is.null(early.stop.round)) + early_stop_round = early.stop.round + if (!is.null(early_stop_round)){ + if (!is.null(feval) && is.null(maximize)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize) && is.null(params$eval_metric)) + stop('Please set maximize to note whether the model is maximizing the evaluation or not.') + if (is.null(maximize)) + { + if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { + maximize = FALSE + } else { + maximize = TRUE + } + } + + if (maximize) { + bestScore = 0 + } else { + bestScore = Inf + } + bestInd = 0 + earlyStopflag = FALSE + + if (length(metrics)>1) + warning('Only the first metric is used for early stopping process.') + } + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type = params[['objective']] mat_pred = FALSE @@ -149,6 +186,24 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) if(verbose) paste(ret, "\n", sep="") %>% cat + + # early_Stopping + if (!is.null(early_stop_round)){ + score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+1] + score = strsplit(score,'\\+|:')[[1]][[2]] + score = as.numeric(score) + if ((maximize && score>bestScore) || (!maximize && score<bestScore)) { + bestScore = score + bestInd = i + } else { + if (i-bestInd>early_stop_round) { + earlyStopflag = TRUE + cat('Stopping. 
Best iteration:',bestInd) + break + } + } + } colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") diff --git a/R-package/demo/early_Stopping.R b/R-package/demo/early_Stopping.R index 4cab385ca..692b413aa 100644 --- a/R-package/demo/early_Stopping.R +++ b/R-package/demo/early_Stopping.R @@ -35,3 +35,5 @@ print ('start training with early Stopping setting') # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror, maximize = FALSE, earlyStopRound = 3) +bst <- xgb.cv(param, dtrain, num_round, nfold=5, obj=logregobj, feval = evalerror, + maximize = FALSE, early_stop_round = 3) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19ab788f9..a2cd68c92 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -7,7 +7,8 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, ...) + verbose = T, early_stop_round = NULL, early.stop.round = NULL, + maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -65,6 +66,15 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process} +\item{early_stop_round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +keeps getting worse consecutively for \code{k} rounds.} + +\item{early.stop.round}{An alternative of \code{early_stop_round}.} + +\item{maximize}{If \code{feval} and \code{early_stop_round} are set, then \code{maximize} must be set as well. 
+ \code{maximize=TRUE} means the larger the evaluation score the better.} + \item{...}{other parameters to pass to \code{params}.} } \value{