R-callbacks refactor

2016-06-09 02:46:13 -05:00
parent 754f3a6e07
commit 422b0000a8
5 changed files with 1109 additions and 515 deletions
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -1,8 +1,10 @@
 #' eXtreme Gradient Boosting Training
 #' 
-#' An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
+#' \code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
 #'
 #' @param params the list of parameters. 
+#'        The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
+#'        Below is a shorter summary:
 #' 
 #' 1. General Parameters
 #' 
@@ -51,60 +53,98 @@
 #'   \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 #' }
 #' 
-#' @param data takes an \code{xgb.DMatrix} as the input.
+#' @param data input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
+#'        \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.
 #' @param nrounds the max number of iterations
 #' @param watchlist what information should be printed when \code{verbose=1} or
-#'   \code{verbose=2}. Watchlist is used to specify validation set monitoring
-#'   during training. For example user can specify
-#'    watchlist=list(validation1=mat1, validation2=mat2) to watch
-#'    the performance of each round's model on mat1 and mat2
+#'        \code{verbose=2}. Watchlist is used to specify validation set monitoring
+#'        during training. For example user can specify
+#'        watchlist=list(validation1=mat1, validation2=mat2) to watch
+#'        the performance of each round's model on mat1 and mat2
 #'
 #' @param obj customized objective function. Returns gradient and second order 
-#'   gradient with given prediction and dtrain, 
+#'        gradient with given prediction and dtrain, 
 #' @param feval customized evaluation function. Returns 
-#'   \code{list(metric='metric-name', value='metric-value')} with given 
-#'   prediction and dtrain,
+#'        \code{list(metric='metric-name', value='metric-value')} with given 
+#'        prediction and dtrain,
 #' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print 
-#'   information of performance. If 2, xgboost will print information of both
-#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
+#'        information of performance. If 2, some additional information will be printed out.
+#' @param print.every.n Print every N progress messages when \code{verbose>0}.
+#'        Default is 1 which means all messages are printed.
 #' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. 
-#'     If set to an integer \code{k}, training with a validation set will stop if the performance 
-#'     keeps getting worse consecutively for \code{k} rounds.
-#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
-#'     \code{maximize=TRUE} means the larger the evaluation score the better.
-#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
-#' @param save_name the name or path for periodically saved model file.
+#'        If set to an integer \code{k}, training with a validation set will stop if the performance 
+#'        keeps getting worse consecutively for \code{k} rounds.
+#' @param maximize If \code{feval} and \code{early.stop.round} are set, 
+#'        then \code{maximize} must be set as well.
+#'        \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to the disk after every \code{save_period} rounds, 0 means save at the end.
+#' @param save_name the name or path for periodically saved model file. 
+#' @param xgb_model the previously built model to continue the training from. 
+#'        Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a 
+#'        file with a previously saved model.
+#' @param callbacks a list of callback functions to perform various tasks during boosting. 
+#'        See \code{\link{callbacks}}. Some of the callbacks are currently automatically 
+#'        created when specific parameters are set.
 #' @param ... other parameters to pass to \code{params}.
+#' @param label the response variable. User should not set this field,
+#'        if data is local data file or \code{xgb.DMatrix}. 
+#' @param missing by default is set to NA, which means that NA values should be considered as 'missing'
+#'        by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
+#'        This parameter is only used when the input is a dense matrix.
+#' @param weight a vector indicating the weight for each row of the input.
 #' 
 #' @details 
-#' This is the training function for \code{xgboost}. 
+#' These are the training functions for \code{xgboost}. 
 #' 
-#' It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}),
-#' therefore it is more flexible than \code{\link{xgboost}} function.
+#' The \code{xgb.train} interface supports advanced features such as \code{watchlist}, 
+#' customized objective and evaluation metric functions, therefore it is more flexible 
+#' than the \code{\link{xgboost}} interface.
 #'
 #' Parallelization is automatically enabled if \code{OpenMP} is present. 
 #' Number of threads can also be manually specified via \code{nthread} parameter.
 #' 
-#' \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
+#' The evaluation metric is chosen automatically by Xgboost (according to the objective)
+#' when the \code{eval_metric} parameter is not provided.
+#' User may set one or several \code{eval_metric} parameters. 
+#' Note that when using a customized metric, only this single metric can be used.
+#' The following is the list of built-in metrics for which Xgboost provides an optimized implementation:
 #'   \itemize{
 #'      \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
 #'      \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
 #'      \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
-#'      \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
+#'      \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
+#'            By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
+#'            Different threshold (e.g., 0.) could be specified as "error@0."
 #'      \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
 #'      \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve} for ranking evaluation.
 #'      \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
 #'   }
-#'   
-#' Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
 #' 
-#' This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
+#' The following callbacks are automatically created when certain parameters are set:
+#' \itemize{
+#'   \item \code{cb.print_evaluation} is turned on when \code{verbose > 0};
+#'         and the \code{print.every.n} parameter is passed to it.
+#'   \item \code{cb.log_evaluation} is on when \code{verbose > 0} and \code{watchlist} is present.
+#'   \item \code{cb.early_stop}: when \code{early.stop.round} is set.
+#'   \item \code{cb.save_model}: when \code{save_period} is set (i.e., is not \code{NULL}).
+#' }
+#' 
+#' @return 
+#' TODO
 #' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#' 
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-#' dtest <- dtrain
+#' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 #' watchlist <- list(eval = dtest, train = dtrain)
+#' 
+#' ## A simple xgb.train example:
+#' param <- list(max.depth = 2, eta = 1, silent = 1, objective="binary:logistic", eval_metric="auc")
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+#' 
+#' ## An xgb.train example where custom objective and evaluation metric are used:
 #' logregobj <- function(preds, dtrain) {
 #'    labels <- getinfo(dtrain, "label")
 #'    preds <- 1/(1 + exp(-preds))
@@ -117,121 +157,139 @@
 #'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'   return(list(metric = "error", value = err))
 #' }
-#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
 #' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+#' 
+#' ## An xgb.train example of using variable learning rates at each iteration:
+#' my_etas <- list(eta = c(0.5, 0.1))
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
+#'                  callbacks = list(cb.reset_parameters(my_etas)))
+#' 
+#' ## Explicit use of the cb.log_evaluation callback allows running 
+#' ## xgb.train silently while still storing the evaluation results:
+#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
+#'                  verbose = 0, callbacks = list(cb.log_evaluation()))
+#' print(bst$evaluation_log)
+#' 
+#' ## An 'xgboost' interface example:
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, 
+#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' pred <- predict(bst, agaricus.test$data)
+#' 
+#' @rdname xgb.train
 #' @export
-xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
+xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
                      obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
                      early.stop.round = NULL, maximize = NULL,
-                      save_period = 0, save_name = "xgboost.model", ...) {
+                      save_period = NULL, save_name = "xgboost.model", 
+                      xgb_model = NULL, callbacks = list(), ...) {
+  
+  params <- check.params(params, ...)
+
+  check.custom.obj()
+  check.custom.eval()
+  
+  # data & watchlist checks
  dtrain <- data
-  if (typeof(params) != "list") {
-    stop("xgb.train: first argument params must be list")
-  }
-  if (class(dtrain) != "xgb.DMatrix") {
-    stop("xgb.train: second argument dtrain must be xgb.DMatrix")
-  }
-  if (verbose > 1) {
-    params <- append(params, list(silent = 0))
-  } else {
-    params <- append(params, list(silent = 1))
-  }
-  if (length(watchlist) != 0 && verbose == 0) {
-    warning('watchlist is provided but verbose=0, no evaluation information will be printed')
+  if (class(dtrain) != "xgb.DMatrix") 
+    stop("second argument dtrain must be xgb.DMatrix")
+  if (length(watchlist) > 0) {
+    if (typeof(watchlist) != "list" ||
+        !all(sapply(watchlist, class) == "xgb.DMatrix"))
+      stop("watchlist must be a list of xgb.DMatrix elements")
+    evnames <- names(watchlist)
+    if (is.null(evnames) || any(evnames == ""))
+      stop("each element of the watchlist must have a name tag")
  }

-  fit.call <- match.call()
-  dot.params <- list(...)
-  nms.params <- names(params)
-  nms.dot.params <- names(dot.params)
-  if (length(intersect(nms.params,nms.dot.params)) > 0)
-    stop("Duplicated term in parameters. Please check your list of params.")
-  params <- append(params, dot.params)
-
-  # customized objective and evaluation metric interface
-  if (!is.null(params$objective) && !is.null(obj))
-    stop("xgb.train: cannot assign two different objectives")
-  if (!is.null(params$objective))
-    if (class(params$objective) == 'function') {
-      obj <- params$objective
-      params$objective <- NULL
-    }
-  if (!is.null(params$eval_metric) && !is.null(feval))
-    stop("xgb.train: cannot assign two different evaluation metrics")
-  if (!is.null(params$eval_metric))
-    if (class(params$eval_metric) == 'function') {
-      feval <- params$eval_metric
-      params$eval_metric <- NULL
-    }
-
-  # Early stopping
-  if (!is.null(early.stop.round)){
-    if (!is.null(feval) && is.null(maximize))
-      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
-    if (length(watchlist) == 0)
-      stop('For early stopping you need at least one set in watchlist.')
-    if (is.null(maximize) && is.null(params$eval_metric))
-      stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
-    if (is.null(maximize))
-    {
-      if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
-        maximize <- FALSE
-      } else {
-        maximize <- TRUE
-      }
-    }
-
-    if (maximize) {
-      bestScore <- 0
-    } else {
-      bestScore <- Inf
-    }
-    bestInd <- 0
-    earlyStopflag = FALSE
-
-    if (length(watchlist) > 1)
-      warning('Only the first data set in watchlist is used for early stopping process.')
-  }
-
-  handle <- xgb.Booster(params, append(watchlist, dtrain))
-  bst <- xgb.handleToBooster(handle)
+  # evaluation printing callback
+  params <- c(params, list(silent = ifelse(verbose > 1, 0, 1)))
  print.every.n <- max( as.integer(print.every.n), 1L)
-  for (i in 1:nrounds) {
-    succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
-    if (length(watchlist) != 0) {
-      msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
-      if (0 == ( (i - 1) %% print.every.n))
-	    cat(paste(msg, "\n", sep = ""))
-      if (!is.null(early.stop.round))
-      {
-        score <- strsplit(msg,':|\\s+')[[1]][3]
-        score <- as.numeric(score)
-        if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
-          bestScore <- score
-          bestInd <- i
-        } else {
-          earlyStopflag = TRUE
-          if (i - bestInd >= early.stop.round) {
-            cat('Stopping. Best iteration:', bestInd, '\n')
-            break
-          }
-        }
-      }
-    }
-    if (save_period > 0) {
-      if (i %% save_period == 0) {
-        xgb.save(bst, save_name)
-      }
-    }
+  if (!has.callbacks(callbacks, 'cb.print_evaluation') && verbose)
+    callbacks <- c(callbacks, cb.print_evaluation(print.every.n))
+
+  # evaluation log callback:  it is automatically enabled only when verbose > 0
+  evaluation_log <- list()
+  if (verbose > 0 &&
+      !has.callbacks(callbacks, 'cb.log_evaluation') &&
+      length(watchlist) > 0)
+    callbacks <- c(callbacks, cb.log_evaluation())
+  
+  # Model saving callback
+  if (!is.null(save_period) &&
+      !has.callbacks(callbacks, 'cb.save_model'))
+    callbacks <- c(callbacks, cb.save_model(save_period, save_name))
+  
+  # Early stopping callback
+  stop_condition <- FALSE
+  if (!is.null(early.stop.round) &&
+      !has.callbacks(callbacks, 'cb.early_stop'))
+    callbacks <- c(callbacks, 
+                   cb.early_stop(early.stop.round, maximize=maximize, verbose=verbose))
+
+  # Sort the callbacks into categories
+  names(callbacks) <- callback.names(callbacks)
+  cb <- categorize.callbacks(callbacks)
+
+  
+  # Construct a booster (either a new one or load from xgb_model)
+  handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
+  bst <- xgb.handleToBooster(handle)
+
+  
+  # When the 'xgb_model' was set, find out how many boosting rounds it has
+  # by adjusting its number of trees for num_parallel_tree and multiclass
+  ntree <- 0
+  if (!is.null(xgb_model)) {
+    ntree <- if ('ntree' %in% names(xgb_model)) xgb_model$ntree
+             else length(grep('^booster', xgb.dump(bst)))
  }
+  num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1)
+  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)
+  nboost <- ntree %/% (num_parallel_tree * num_class)
+
+  # TODO: distributed code
+  rank <- 0
+  
+  begin_iteration <- nboost + 1
+  end_iteration <- nboost + nrounds
+  
+  # the main loop for boosting iterations
+  for (iteration in begin_iteration:end_iteration) {
+    
+    for (f in cb$pre_iter) f()
+    
+    xgb.iter.update(bst$handle, dtrain, iteration - 1, obj)
+    
+    bst_evaluation <- numeric(0)
+    if (length(watchlist) > 0)
+      bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval)
+
+    for (f in cb$post_iter) f()
+
+    if (stop_condition) break
+  }
+  for (f in cb$finalize) f(finalize=TRUE)
+  
  bst <- xgb.Booster.check(bst)
-
-  if (!is.null(early.stop.round)) {
-    bst$bestScore <- bestScore
-    bst$bestInd <- bestInd
+  
+  # store the total number of boosting iterations and the number of trees
+  bst$nboost = end_iteration
+  bst$ntree = end_iteration * num_parallel_tree * num_class
+  
+  # store the evaluation results
+  if (length(evaluation_log) > 0 &&
+      nrow(evaluation_log) > 0) {
+    # include the previous compatible history when available
+    if (class(xgb_model) == 'xgb.Booster' &&
+        !is.null(xgb_model$evaluation_log) &&
+        all(colnames(evaluation_log) == colnames(xgb_model$evaluation_log)))
+      evaluation_log <- rbindlist(list(xgb_model$evaluation_log, evaluation_log))
+    bst$evaluation_log <- evaluation_log
  }

-  attr(bst, "call") <- fit.call
-  attr(bst, "params") <- params
+  bst$call <- match.call()
+  bst$params <- params
+  bst$callbacks <- callbacks
+  
  return(bst)
 }