make it possible to use a list of pre-defined CV folds in xgb.cv

2015-04-03 13:24:04 -05:00 · 2015-04-03 13:24:04 -05:00 · 31b0e53cd4
commit 31b0e53cd4
parent c03b42054f
3 changed files with 42 additions and 28 deletions
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@ -214,43 +214,45 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
 #------------------------------------------
 # helper functions for cross validation
 #
-xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
+xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
-  randidx <- sample(1 : xgb.numrow(dall))
-  y <- getinfo(dall, 'label')
-  if (stratified & length(y) == length(randidx)) {
-    y <- y[randidx]
-    # By default assume that y is a classification label,
-    # and only leave it numeric for the reg:linear objective.
-    # WARNING: if there would be any other objectives with truly
-    #   numerical labels, they currently would not be treated correctly!
-    if (param[['objective']] != 'reg:linear') y <- factor(y)
-    idset <- xgb.createFolds(y, nfold)
-  } else { 
-    # make simple non-stratified folds
-    kstep <- length(randidx) %/% nfold
-    idset <- list()
-    for (i in 1:(nfold-1)) {
-      idset[[i]] = randidx[1:kstep]
-      randidx = setdiff(randidx,idset[[i]])
+  if(is.null(folds)) {
+    y <- getinfo(dall, 'label')
+    randidx <- sample(1 : xgb.numrow(dall))
+    if (stratified & length(y) == length(randidx)) {
+      y <- y[randidx]
+      # By default assume that y is a classification label,
+      # and only leave it numeric for the reg:linear objective.
+      # WARNING: if there would be any other objectives with truly
+      #   numerical labels, they currently would not be treated correctly!
+      if (param[['objective']] != 'reg:linear') y <- factor(y)
+      folds <- xgb.createFolds(y, nfold)
+    } else { 
+      # make simple non-stratified folds
+      kstep <- length(randidx) %/% nfold
+      folds <- list()
+      for (i in 1:(nfold-1)) {
+        folds[[i]] = randidx[1:kstep]
+        randidx = setdiff(randidx, folds[[i]])
+      }
+      folds[[nfold]] = randidx
    }
-    idset[[nfold]] = randidx
  }
  ret <- list()
  for (k in 1:nfold) {
-    dtest <- slice(dall, idset[[k]])
+    dtest <- slice(dall, folds[[k]])
    didx = c()
    for (i in 1:nfold) {
      if (i != k) {
-        didx <- append(didx, idset[[i]])
+        didx <- append(didx, folds[[i]])
      }
    }
    dtrain <- slice(dall, didx)
    bst <- xgb.Booster(param, list(dtrain, dtest))
    watchlist = list(train=dtrain, test=dtest)
-    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
+    ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
  }
  return (ret)
 }
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@ -50,7 +50,9 @@
 #' @param feval customized evaluation function. Returns 
 #'   \code{list(metric='metric-name', value='metric-value')} with given 
 #'   prediction and dtrain.
-#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}
+#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
+#' @param folds \code{list} of pre-defined CV folds (each element must be a vector of the indices held out as the test set for that fold).
+#'   If folds are supplied, the nfold and stratified parameters are ignored.
 #' @param verbose \code{boolean}, print the statistics during the process
 #' @param ... other parameters to pass to \code{params}.
 #' 
@ -84,10 +86,16 @@
 #'
 xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, 
                   prediction = FALSE, showsd = TRUE, metrics=list(), 
-                   obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) {
+                   obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
+  if(!is.null(folds)) {
+    if(class(folds)!="list" | length(folds) < 2) {
+      stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
+    }
+    nfold <- length(folds)
+  }
  if (nfold <= 1) {
    stop("nfold must be bigger than 1")
  }
@ -102,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
    params <- append(params, list("eval_metric"=mc))
  }

-  folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified)
+  xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
  obj_type = params[['objective']]
  mat_pred = FALSE
  if (!is.null(obj_type) && obj_type=='multi:softprob')
@ -119,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  for (i in 1:nrounds) {
    msg <- list()
    for (k in 1:nfold) {
-      fd <- folds[[k]]
+      fd <- xgb_folds[[k]]
      succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      if (i<nrounds) {
          msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@ -6,7 +6,8 @@
 \usage{
 xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
  missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
-  obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...)
+  obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
+  verbose = T, ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
@ -57,7 +58,10 @@ gradient with given prediction and dtrain.}
 \code{list(metric='metric-name', value='metric-value')} with given
 prediction and dtrain.}

-\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}}
+\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
+
+\item{folds}{\code{list} of pre-defined CV folds (each element must be a vector of the indices held out as the test set for that fold).
+If folds are supplied, the nfold and stratified parameters are ignored.}

 \item{verbose}{\code{boolean}, print the statistics during the process}