# xgboost/R-package/R/xgb.cv.R

#' Cross Validation
#'
#' The cross validation function of xgboost
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of threads used in training; if not set, all threads are used
#' }
#'
#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#' further details. See also \code{demo/} for a walkthrough example in R.
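#'
#' A minimal sketch of such a parameter list (values are illustrative only):
#' \preformatted{
#' param <- list("objective" = "binary:logistic", "eta" = 0.3,
#'               "max.depth" = 6, "nthread" = 2)
#' }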
#' @param data takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label the label vector; optional, only used when \code{data} is a matrix.
#' @param missing only used when the input is a dense matrix; pick a float
#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether to show the standard deviation of cross validation.
#' @param metrics list of evaluation metrics to be used in cross validation;
#' when not specified, the evaluation metric is chosen according to the objective function.
#' Possible options are:
#' \itemize{
#' \item \code{error} binary classification error rate
#' \item \code{rmse} Root mean square error
#' \item \code{logloss} negative log-likelihood function
#' \item \code{auc} Area under curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain; see the sketch below.
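#' A sketch of such a function, assuming the usual \code{(preds, dtrain)}
#' signature and \code{getinfo} from this package:
#' \preformatted{
#' logregobj <- function(preds, dtrain) {
#'   labels <- getinfo(dtrain, "label")
#'   preds <- 1 / (1 + exp(-preds))  # raw margin to probability
#'   grad <- preds - labels          # first order gradient
#'   hess <- preds * (1 - preds)     # second order gradient
#'   return(list(grad = grad, hess = hess))
#' }
#' }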
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain; see the sketch below.
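#' For example, a binary classification error rate could be computed like this
#' (a sketch, assuming raw margin predictions and binary labels):
#' \preformatted{
#' evalerror <- function(preds, dtrain) {
#'   labels <- getinfo(dtrain, "label")
#'   err <- mean(as.numeric(preds > 0) != labels)
#'   return(list(metric = "error", value = err))
#' }
#' }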
#' @param verbose \code{boolean}, whether to print the evaluation statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
#' @return A \code{data.table} with the mean and standard deviation of each evaluation
#' metric for the training and test sets, one row per round. If \code{prediction = TRUE},
#' a list with this table (\code{dt}) and the prediction vector (\code{pred}) is returned instead.
#'
#' @details
#' This is the cross validation function for xgboost.
#'
#' Parallelization is automatically enabled if OpenMP is present.
#' The number of threads can also be specified manually via the \code{nthread} parameter.
#'
#' This function only accepts an \code{xgb.DMatrix} object as the input.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' history <- xgb.cv(data = dtrain, nrounds = 3, nfold = 5, metrics = list("rmse","auc"),
#'                   "max.depth" = 3, "eta" = 1, "objective" = "binary:logistic")
#' print(history)
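#' # A further sketch: also keep the prediction vector (the return value is then
#' # a list with the evaluation table and the predictions):
#' res <- xgb.cv(data = dtrain, nrounds = 3, nfold = 5, prediction = TRUE,
#'               "max.depth" = 3, "eta" = 1, "objective" = "binary:logistic")
#' str(res$pred)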
#' @export
#'
xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NULL,
                   prediction = FALSE, showsd = TRUE, metrics = list(),
                   obj = NULL, feval = NULL, verbose = TRUE, ...) {
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be a list")
  }
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
  # Construct the xgb.DMatrix, passing the missing-value marker only when given
  if (is.null(missing)) {
    dtrain <- xgb.get.DMatrix(data, label)
  } else {
    dtrain <- xgb.get.DMatrix(data, label, missing)
  }
  # Merge extra arguments and the requested evaluation metrics into params
  params <- append(params, list(...))
  params <- append(params, list(silent = 1))
  for (mc in metrics) {
    params <- append(params, list("eval_metric" = mc))
  }
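  # Create the nfold data splits; each fold carries its own booster, its
  # train/test watchlist, and the indices of its held-out rows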
  folds <- xgb.cv.mknfold(dtrain, nfold, params)
  predictValues <- rep(0, xgb.numrow(dtrain))
  history <- c()
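  # Main boosting loop: at each round, advance every fold's booster by one
  # iteration, then aggregate the per-fold evaluation messages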
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
      fd <- folds[[k]]
      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      if (!prediction) {
        msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>%
          str_split("\t") %>% .[[1]]
      } else {
        res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
        predictValues[fd$index] <- res[[2]]
        msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
      }
    }
    ret <- xgb.cv.aggcv(msg, showsd)
    history <- c(history, ret)
    if (verbose) paste(ret, "\n", sep = "") %>% cat
  }
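  # Parse the tab-separated evaluation log into numeric mean/std columns of a
  # data.table, one row per boosting round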
  colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>%
    str_extract(".*:") %>% str_replace(":", "") %>% str_replace("-", ".")
  colnamesMean <- paste(colnames, "mean")
  colnamesStd <- paste(colnames, "std")
  colnames <- c()
  for (i in seq_along(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
  type <- rep(x = "numeric", times = length(colnames))
  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
  split <- str_split(string = history, pattern = "\t")
  for (line in split) {
    dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>%
      unlist %>% as.list %>% {rbindlist(list(dt, .), use.names = FALSE, fill = FALSE)}
  }
  if (prediction) {
    return(list(dt = dt, pred = predictValues))
  }
  return(dt)
}