From b8711226e2f121a7cb76200716421ee840c0f900 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Thu, 2 Apr 2015 19:48:23 -0500 Subject: [PATCH] added an option for stratified CV to xgb.cv --- R-package/R/utils.R | 77 ++++++++++++++++++++++++++++++++++++++++---- R-package/R/xgb.cv.R | 11 ++++--- 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 7336ed213..b0b565a3d 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -214,18 +214,30 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F #------------------------------------------ # helper functions for cross validation # -xgb.cv.mknfold <- function(dall, nfold, param) { +xgb.cv.mknfold <- function(dall, nfold, param, stratified) { if (nfold <= 1) { stop("nfold must be bigger than 1") } randidx <- sample(1 : xgb.numrow(dall)) - kstep <- length(randidx) %/% nfold - idset <- list() - for (i in 1:(nfold-1)) { - idset[[i]] = randidx[1:kstep] - randidx = setdiff(randidx,idset[[i]]) + y <- getinfo(dall, 'label') + if (stratified & length(y) == length(randidx)) { + y <- y[randidx] + # by default assume that y is a classification label, and only + # leave it numeric for the reg:linear objective + # WARNING: if there would be any objectives with truly numerical + # they would not currently be treated correctly. + if (param[['objective']] != 'reg:linear') y <- factor(y) + idset <- xgb.createFolds(y, nfold) + } else { + # make simple non-stratified folds + kstep <- length(randidx) %/% nfold + idset <- list() + for (i in 1:(nfold-1)) { + idset[[i]] = randidx[1:kstep] + randidx = setdiff(randidx,idset[[i]]) + } + idset[[nfold]] = randidx } - idset[[nfold]] = randidx ret <- list() for (k in 1:nfold) { dtest <- slice(dall, idset[[k]]) @@ -242,6 +254,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) { } return (ret) } + xgb.cv.aggcv <- function(res, showsd = TRUE) { header <- res[[1]] ret <- header[1] @@ -261,3 +274,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) { } return (ret) } + +# Shamelessly copied from caret::createFolds +# and simplified by always returning an unnamed list of test indices +xgb.createFolds <- function(y, k = 10) +{ + if(is.numeric(y)) { + ## Group the numeric data based on their magnitudes + ## and sample within those groups. + + ## When the number of samples is low, we may have + ## issues further slicing the numeric data into + ## groups. The number of groups will depend on the + ## ratio of the number of folds to the sample size. + ## At most, we will use quantiles. If the sample + ## is too small, we just do regular unstratified + ## CV + cuts <- floor(length(y)/k) + if(cuts < 2) cuts <- 2 + if(cuts > 5) cuts <- 5 + y <- cut(y, + unique(quantile(y, probs = seq(0, 1, length = cuts))), + include.lowest = TRUE) + } + + if(k < length(y)) { + ## reset levels so that the possible levels and + ## the levels in the vector are the same + y <- factor(as.character(y)) + numInClass <- table(y) + foldVector <- vector(mode = "integer", length(y)) + + ## For each class, balance the fold allocation as far + ## as possible, then resample the remainder. + ## The final assignment of folds is also randomized. + for(i in 1:length(numInClass)) { + ## create a vector of integers from 1:k as many times as possible without + ## going over the number of samples in the class. Note that if the number + ## of samples in a class is less than k, nothing is producd here. + seqVector <- rep(1:k, numInClass[i] %/% k) + ## add enough random integers to get length(seqVector) == numInClass[i] + if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k)) + ## shuffle the integers for fold assignment and assign to this classes's data + foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector) + } + } else foldVector <- seq(along = y) + + out <- split(seq(along = y), foldVector) + names(out) <- NULL + out +} diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 3472ead4d..38f70b973 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -46,11 +46,12 @@ #' \item \code{merror} Exact matching error, used to evaluate multi-class classification #' } #' @param obj customized objective function. Returns gradient and second order -#' gradient with given prediction and dtrain, +#' gradient with given prediction and dtrain. #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given -#' prediction and dtrain, -#' @param verbose \code{boolean}, print the statistics during the process. +#' prediction and dtrain. +#' @param stratified \code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data} +#' @param verbose \code{boolean}, print the statistics during the process #' @param ... other parameters to pass to \code{params}. #' #' @return A \code{data.table} with each mean and standard deviation stat for training set and test set. @@ -76,7 +77,7 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -94,7 +95,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = params <- append(params, list("eval_metric"=mc)) } - folds <- xgb.cv.mknfold(dtrain, nfold, params) + folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified) obj_type = params[['objective']] mat_pred = FALSE if (!is.null(obj_type) && obj_type=='multi:softprob')