added an option for stratified CV to xgb.cv

This commit is contained in:
Vadim Khotilovich 2015-04-02 19:48:23 -05:00
parent e9c95645a3
commit b8711226e2
2 changed files with 76 additions and 12 deletions

View File

@ -214,18 +214,30 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]])
y <- getinfo(dall, 'label')
if (stratified & length(y) == length(randidx)) {
y <- y[randidx]
# by default assume that y is a classification label, and only
# leave it numeric for the reg:linear objective
# WARNING: if there would be any objectives with truly numerical
# they would not currently be treated correctly.
if (param[['objective']] != 'reg:linear') y <- factor(y)
idset <- xgb.createFolds(y, nfold)
} else {
# make simple non-stratified folds
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]])
}
idset[[nfold]] = randidx
}
idset[[nfold]] = randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
@ -242,6 +254,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
}
return (ret)
}
xgb.cv.aggcv <- function(res, showsd = TRUE) {
header <- res[[1]]
ret <- header[1]
@ -261,3 +274,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
}
return (ret)
}
# Shamelessly copied from caret::createFolds
# and simplified by always returning an unnamed list of test indices
xgb.createFolds <- function(y, k = 10)
{
if(is.numeric(y)) {
## Group the numeric data based on their magnitudes
## and sample within those groups.
## When the number of samples is low, we may have
## issues further slicing the numeric data into
## groups. The number of groups will depend on the
## ratio of the number of folds to the sample size.
## At most, we will use quantiles. If the sample
## is too small, we just do regular unstratified
## CV
cuts <- floor(length(y)/k)
if(cuts < 2) cuts <- 2
if(cuts > 5) cuts <- 5
y <- cut(y,
unique(quantile(y, probs = seq(0, 1, length = cuts))),
include.lowest = TRUE)
}
if(k < length(y)) {
## reset levels so that the possible levels and
## the levels in the vector are the same
y <- factor(as.character(y))
numInClass <- table(y)
foldVector <- vector(mode = "integer", length(y))
## For each class, balance the fold allocation as far
## as possible, then resample the remainder.
## The final assignment of folds is also randomized.
for(i in 1:length(numInClass)) {
## create a vector of integers from 1:k as many times as possible without
## going over the number of samples in the class. Note that if the number
## of samples in a class is less than k, nothing is producd here.
seqVector <- rep(1:k, numInClass[i] %/% k)
## add enough random integers to get length(seqVector) == numInClass[i]
if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
## shuffle the integers for fold assignment and assign to this classes's data
foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
}
} else foldVector <- seq(along = y)
out <- split(seq(along = y), foldVector)
names(out) <- NULL
out
}

View File

@ -46,11 +46,12 @@
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain,
#' gradient with given prediction and dtrain.
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param verbose \code{boolean}, print the statistics during the process.
#' prediction and dtrain.
#' @param stratified \code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data}
#' @param verbose \code{boolean}, print the statistics during the process
#' @param ... other parameters to pass to \code{params}.
#'
#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set.
@ -76,7 +77,7 @@
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, verbose = T,...) {
obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@ -94,7 +95,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
params <- append(params, list("eval_metric"=mc))
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified)
obj_type = params[['objective']]
mat_pred = FALSE
if (!is.null(obj_type) && obj_type=='multi:softprob')