Merge pull request #227 from khotilov/master

add stratified cross validation for classification
2015-04-30 11:39:52 -07:00 · 2015-04-30 11:39:52 -07:00 · bab7b58d94
commit bab7b58d94
parent 188d81d64a aefd234da3
3 changed files with 118 additions and 25 deletions
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@ -214,34 +214,49 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
 #------------------------------------------
 # helper functions for cross validation
 #
-xgb.cv.mknfold <- function(dall, nfold, param) {
+xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
+  if(is.null(folds)) {
+    y <- getinfo(dall, 'label')
    randidx <- sample(1 : xgb.numrow(dall))
+    if (stratified & length(y) == length(randidx)) {
+      y <- y[randidx]
+      # By default assume that y is a classification label,
+      # and only leave it numeric for the reg:linear objective.
+      # WARNING: if there would be any other objectives with truly
+      #   numerical labels, they currently would not be treated correctly!
+      if (param[['objective']] != 'reg:linear') y <- factor(y)
+      folds <- xgb.createFolds(y, nfold)
+    } else { 
+      # make simple non-stratified folds
      kstep <- length(randidx) %/% nfold
-  idset <- list()
+      folds <- list()
      for (i in 1:(nfold-1)) {
-    idset[[i]] = randidx[1:kstep]
-    randidx = setdiff(randidx,idset[[i]])
+        folds[[i]] = randidx[1:kstep]
+        randidx = setdiff(randidx, folds[[i]])
+      }
+      folds[[nfold]] = randidx
+    }
  }
-  idset[[nfold]] = randidx
  ret <- list()
  for (k in 1:nfold) {
-    dtest <- slice(dall, idset[[k]])
+    dtest <- slice(dall, folds[[k]])
    didx = c()
    for (i in 1:nfold) {
      if (i != k) {
-        didx <- append(didx, idset[[i]])
+        didx <- append(didx, folds[[i]])
      }
    }
    dtrain <- slice(dall, didx)
    bst <- xgb.Booster(param, list(dtrain, dtest))
    watchlist = list(train=dtrain, test=dtest)
-    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
+    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
  }
  return (ret)
 }
+
 xgb.cv.aggcv <- function(res, showsd = TRUE) {
  header <- res[[1]]
  ret <- header[1]
@ -261,3 +276,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
  }
  return (ret)
 }
+
+# Shamelessly copied from caret::createFolds
+# and simplified by always returning an unnamed list of test indices
+xgb.createFolds <- function(y, k = 10)
+{
+  if(is.numeric(y)) {
+    ## Group the numeric data based on their magnitudes
+    ## and sample within those groups.
+
+    ## When the number of samples is low, we may have
+    ## issues further slicing the numeric data into
+    ## groups. The number of groups will depend on the
+    ## ratio of the number of folds to the sample size.
+    ## At most, we will use quantiles. If the sample
+    ## is too small, we just do regular unstratified
+    ## CV
+    cuts <- floor(length(y)/k)
+    if(cuts < 2) cuts <- 2
+    if(cuts > 5) cuts <- 5
+    y <- cut(y,
+             unique(quantile(y, probs = seq(0, 1, length = cuts))),
+             include.lowest = TRUE)
+  }
+
+  if(k < length(y)) {
+    ## reset levels so that the possible levels and
+    ## the levels in the vector are the same
+    y <- factor(as.character(y))
+    numInClass <- table(y)
+    foldVector <- vector(mode = "integer", length(y))
+    
+    ## For each class, balance the fold allocation as far
+    ## as possible, then resample the remainder.
+    ## The final assignment of folds is also randomized.
+    for(i in 1:length(numInClass)) {
+      ## create a vector of integers from 1:k as many times as possible without
+      ## going over the number of samples in the class. Note that if the number
+      ## of samples in a class is less than k, nothing is producd here.
+      seqVector <- rep(1:k, numInClass[i] %/% k)
+      ## add enough random integers to get  length(seqVector) == numInClass[i]
+      if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
+      ## shuffle the integers for fold assignment and assign to this classes's data
+      foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
+    }
+  } else foldVector <- seq(along = y)
+
+  out <- split(seq(along = y), foldVector)
+  names(out) <- NULL
+  out
+}
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@ -46,14 +46,24 @@
 #'   \item \code{merror} Exact matching error, used to evaluate multi-class classification
 #' }
 #' @param obj customized objective function. Returns gradient and second order 
-#'   gradient with given prediction and dtrain, 
+#'   gradient with given prediction and dtrain.
 #' @param feval custimized evaluation function. Returns 
 #'   \code{list(metric='metric-name', value='metric-value')} with given 
-#'   prediction and dtrain,
-#' @param verbose \code{boolean}, print the statistics during the process.
+#'   prediction and dtrain.
+#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
+#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
+#'   If folds are supplied, the nfold and stratified parameters would be ignored.
+#' @param verbose \code{boolean}, print the statistics during the process
 #' @param ... other parameters to pass to \code{params}.
 #' 
-#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set.
+#' @return
+#' If \code{prediction = TRUE}, a list with the following elements is returned:
+#' \itemize{
+#'   \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
+#'   \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
+#' }
+#'
+#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
 #'
 #' @details 
 #' The original sample is randomly partitioned into \code{nfold} equal size subsamples. 
@ -76,10 +86,16 @@
 #'
 xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, 
                   prediction = FALSE, showsd = TRUE, metrics=list(), 
-                   obj = NULL, feval = NULL, verbose = T,...) {
+                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
+  if(!is.null(folds)) {
+    if(class(folds)!="list" | length(folds) < 2) {
+      stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
+    }
+    nfold <- length(folds)
+  }
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
@ -94,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
    params <- append(params, list("eval_metric"=mc))
  }

-  folds <- xgb.cv.mknfold(dtrain, nfold, params)
+  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
  obj_type = params[['objective']]
  mat_pred = FALSE
  if (!is.null(obj_type) && obj_type=='multi:softprob')
@ -111,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
-      fd <- folds[[k]]
+      fd <- xgb_folds[[k]]
      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      if (i<nrounds) {
          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
@ -147,7 +163,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
  split <- str_split(string = history, pattern = "\t")
  
-  for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
+  for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
  
  if (prediction) {
    return(list(dt = dt,pred = predictValues))
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@ -6,7 +6,8 @@
 \usage{
 xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
  missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
-  obj = NULL, feval = NULL, verbose = T, ...)
+  obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
+  verbose = T, ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
@ -51,18 +52,29 @@ value that represents missing value. Sometime a data use 0 or other extreme valu
 }}

 \item{obj}{customized objective function. Returns gradient and second order
-gradient with given prediction and dtrain,}
+gradient with given prediction and dtrain.}

 \item{feval}{custimized evaluation function. Returns
 \code{list(metric='metric-name', value='metric-value')} with given
-prediction and dtrain,}
+prediction and dtrain.}

-\item{verbose}{\code{boolean}, print the statistics during the process.}
+\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
+
+\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
+If folds are supplied, the nfold and stratified parameters would be ignored.}
+
+\item{verbose}{\code{boolean}, print the statistics during the process}

 \item{...}{other parameters to pass to \code{params}.}
 }
 \value{
-A \code{data.table} with each mean and standard deviation stat for training set and test set.
+If \code{prediction = TRUE}, a list with the following elements is returned:
+\itemize{
+  \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
+  \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
+}
+
+If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
 }
 \description{
 The cross valudation function of xgboost