add early stopping to R

2015-05-05 16:31:49 -07:00
parent 3b4697786e
commit 54fb49ee5c
24 changed files with 106 additions and 31 deletions
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -66,7 +66,11 @@
 #'   prediction and dtrain,
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print 
 #'   information of performance. If 2, xgboost will print information of both
-#'
+#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. 
+#'     If set to an integer \code{k}, training with a validation set will stop if the performance 
+#'     keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well.
+#'     \code{maximize=TRUE} means the larger the evaluation score the better.
 #' @param ... other parameters to pass to \code{params}.
 #' 
 #' @details 
@@ -114,7 +118,8 @@
 #' @export
 #' 
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(), 
-                      obj = NULL, feval = NULL, verbose = 1, ...) {
+                      obj = NULL, feval = NULL, verbose = 1, 
+                      earlyStopRound = NULL, maximize = NULL, ...) {
  dtrain <- data
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list")
@@ -133,6 +138,33 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
  }
  params = append(params, list(...))
  
+  # Early stopping
+  if (!is.null(feval) && is.null(maximize))
+    stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
+  if (length(watchlist) == 0 && !is.null(earlyStopRound))
+    stop('For early stopping you need at least one set in watchlist.')
+  if (is.null(maximize) && is.null(params$eval_metric))
+    stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
+  if (is.null(maximize))
+  {
+    if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
+      maximize = FALSE
+    } else {
+      maximize = TRUE
+    }
+  }
+  
+  if (maximize) {
+    bestScore = 0
+  } else {
+    bestScore = Inf
+  }
+  bestInd = 0
+  earlyStopflag = FALSE
+  
+  if (length(watchlist)>1 && !is.null(earlyStopRound))
+    warning('Only the first data set in watchlist is used for early stopping process.')
+  
  handle <- xgb.Booster(params, append(watchlist, dtrain))
  bst <- xgb.handleToBooster(handle)
  for (i in 1:nrounds) {
@@ -140,8 +172,30 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
    if (length(watchlist) != 0) {
      msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
      cat(paste(msg, "\n", sep=""))
+      if (!is.null(earlyStopRound))
+      {
+        score = strsplit(msg,'\\s+')[[1]][1]
+        score = strsplit(score,':')[[1]][2]
+        score = as.numeric(score)
+        if ((maximize && score>bestScore) || (!maximize && score<bestScore)) {
+          bestScore = score
+          bestInd = i
+        } else {
+          if (i-bestInd>earlyStopRound) {
+            earlyStopflag = TRUE
+          }
+        }
+      }
+    }
+    if (earlyStopflag) {
+      cat('Stopping. Best iteration:',bestInd)
+      break
    }
  }
  bst <- xgb.Booster.check(bst)
+  if (!is.null(earlyStopRound)) {
+    bst$bestScore = bestScore
+    bst$bestInd = bestInd
+  }
  return(bst)
 } 
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -30,6 +30,11 @@
 #'   performance and construction progress information
 #' @param missing Missing is only used when input is dense matrix, pick a float 
 #'     value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
+#' @param earlyStopRound If \code{NULL}, the early stopping function is not triggered. 
+#'     If set to an integer \code{k}, training with a validation set will stop if the performance 
+#'     keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{earlyStopRound} are set, then \code{maximize} must be set as well.
+#'     \code{maximize=TRUE} means the larger the evaluation score the better.
 #' @param ... other parameters to pass to \code{params}.
 #' 
 #' @details 
@@ -51,7 +56,7 @@
 #' @export
 #' 
 xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, 
-                    verbose = 1, ...) {
+                    verbose = 1, earlyStopRound = NULL, maximize = NULL, ...) {
  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
@@ -66,7 +71,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
    watchlist <- list()
  }
  
-  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose=verbose)
+  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose,
+                   earlyStopRound = earlyStopRound)
  
  return(bst)
 }