From b8711226e2f121a7cb76200716421ee840c0f900 Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich <khotilovich@gmail.com>
Date: Thu, 2 Apr 2015 19:48:23 -0500
Subject: [PATCH] added an option for stratified CV to xgb.cv

---
 R-package/R/utils.R  | 77 ++++++++++++++++++++++++++++++++++++++++----
 R-package/R/xgb.cv.R | 11 ++++---
 2 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 7336ed213..b0b565a3d 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -214,18 +214,30 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
 #------------------------------------------
 # helper functions for cross validation
 #
-xgb.cv.mknfold <- function(dall, nfold, param) {
+xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
   if (nfold <= 1) {
     stop("nfold must be bigger than 1")
   }
   randidx <- sample(1 : xgb.numrow(dall))
-  kstep <- length(randidx) %/% nfold
-  idset <- list()
-  for (i in 1:(nfold-1)) {
-    idset[[i]] = randidx[1:kstep]
-    randidx = setdiff(randidx,idset[[i]])
+  y <- getinfo(dall, 'label')
+  if (stratified & length(y) == length(randidx)) {
+    y <- y[randidx]
+    # by default assume that y is a classification label, and only
+    # leave it numeric for the reg:linear objective
+    # WARNING: if there would be any objectives with truly numerical 
+    #  they would not currently be treated correctly.
+    if (param[['objective']] != 'reg:linear') y <- factor(y)
+    idset <- xgb.createFolds(y, nfold)
+  } else { 
+    # make simple non-stratified folds
+    kstep <- length(randidx) %/% nfold
+    idset <- list()
+    for (i in 1:(nfold-1)) {
+      idset[[i]] = randidx[1:kstep]
+      randidx = setdiff(randidx,idset[[i]])
+    }
+    idset[[nfold]] = randidx
   }
-  idset[[nfold]] = randidx
   ret <- list()
   for (k in 1:nfold) {
     dtest <- slice(dall, idset[[k]])
@@ -242,6 +254,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
   }
   return (ret)
 }
+
 xgb.cv.aggcv <- function(res, showsd = TRUE) {
   header <- res[[1]]
   ret <- header[1]
@@ -261,3 +274,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
   }
   return (ret)
 }
+
+# Shamelessly copied from caret::createFolds
+# and simplified by always returning an unnamed list of test indices
+xgb.createFolds <- function(y, k = 10)
+{
+  if(is.numeric(y)) {
+    ## Group the numeric data based on their magnitudes
+    ## and sample within those groups.
+
+    ## When the number of samples is low, we may have
+    ## issues further slicing the numeric data into
+    ## groups. The number of groups will depend on the
+    ## ratio of the number of folds to the sample size.
+    ## At most, we will use quantiles. If the sample
+    ## is too small, we just do regular unstratified
+    ## CV
+    cuts <- floor(length(y)/k)
+    if(cuts < 2) cuts <- 2
+    if(cuts > 5) cuts <- 5
+    y <- cut(y,
+             unique(quantile(y, probs = seq(0, 1, length = cuts))),
+             include.lowest = TRUE)
+  }
+
+  if(k < length(y)) {
+    ## reset levels so that the possible levels and
+    ## the levels in the vector are the same
+    y <- factor(as.character(y))
+    numInClass <- table(y)
+    foldVector <- vector(mode = "integer", length(y))
+    
+    ## For each class, balance the fold allocation as far
+    ## as possible, then resample the remainder.
+    ## The final assignment of folds is also randomized.
+    for(i in 1:length(numInClass)) {
+      ## create a vector of integers from 1:k as many times as possible without
+      ## going over the number of samples in the class. Note that if the number
+      ## of samples in a class is less than k, nothing is producd here.
+      seqVector <- rep(1:k, numInClass[i] %/% k)
+      ## add enough random integers to get  length(seqVector) == numInClass[i]
+      if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
+      ## shuffle the integers for fold assignment and assign to this classes's data
+      foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
+    }
+  } else foldVector <- seq(along = y)
+
+  out <- split(seq(along = y), foldVector)
+  names(out) <- NULL
+  out
+}
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 3472ead4d..38f70b973 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -46,11 +46,12 @@
 #'   \item \code{merror} Exact matching error, used to evaluate multi-class classification
 #' }
 #' @param obj customized objective function. Returns gradient and second order 
-#'   gradient with given prediction and dtrain, 
+#'   gradient with given prediction and dtrain.
 #' @param feval custimized evaluation function. Returns 
 #'   \code{list(metric='metric-name', value='metric-value')} with given 
-#'   prediction and dtrain,
-#' @param verbose \code{boolean}, print the statistics during the process.
+#'   prediction and dtrain.
+#' @param stratified \code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data}
+#' @param verbose \code{boolean}, print the statistics during the process
 #' @param ... other parameters to pass to \code{params}.
 #' 
 #' @return A \code{data.table} with each mean and standard deviation stat for training set and test set.
@@ -76,7 +77,7 @@
 #'
 xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, 
                    prediction = FALSE, showsd = TRUE, metrics=list(), 
-                   obj = NULL, feval = NULL, verbose = T,...) {
+                   obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) {
   if (typeof(params) != "list") {
     stop("xgb.cv: first argument params must be list")
   }
@@ -94,7 +95,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     params <- append(params, list("eval_metric"=mc))
   }
 
-  folds <- xgb.cv.mknfold(dtrain, nfold, params)
+  folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified)
   obj_type = params[['objective']]
   mat_pred = FALSE
   if (!is.null(obj_type) && obj_type=='multi:softprob')