make it possible to use a list of pre-defined CV folds in xgb.cv

This commit is contained in:
Vadim Khotilovich 2015-04-03 13:24:04 -05:00
parent c03b42054f
commit 31b0e53cd4
3 changed files with 42 additions and 28 deletions

View File

@ -214,12 +214,13 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
#------------------------------------------ #------------------------------------------
# helper functions for cross validation # helper functions for cross validation
# #
xgb.cv.mknfold <- function(dall, nfold, param, stratified) { xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
if (nfold <= 1) { if (nfold <= 1) {
stop("nfold must be bigger than 1") stop("nfold must be bigger than 1")
} }
randidx <- sample(1 : xgb.numrow(dall)) if(is.null(folds)) {
y <- getinfo(dall, 'label') y <- getinfo(dall, 'label')
randidx <- sample(1 : xgb.numrow(dall))
if (stratified & length(y) == length(randidx)) { if (stratified & length(y) == length(randidx)) {
y <- y[randidx] y <- y[randidx]
# By default assume that y is a classification label, # By default assume that y is a classification label,
@ -227,30 +228,31 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
# WARNING: if there would be any other objectives with truly # WARNING: if there would be any other objectives with truly
# numerical labels, they currently would not be treated correctly! # numerical labels, they currently would not be treated correctly!
if (param[['objective']] != 'reg:linear') y <- factor(y) if (param[['objective']] != 'reg:linear') y <- factor(y)
idset <- xgb.createFolds(y, nfold) folds <- xgb.createFolds(y, nfold)
} else { } else {
# make simple non-stratified folds # make simple non-stratified folds
kstep <- length(randidx) %/% nfold kstep <- length(randidx) %/% nfold
idset <- list() folds <- list()
for (i in 1:(nfold-1)) { for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep] folds[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]]) randidx = setdiff(randidx, folds[[i]])
}
folds[[nfold]] = randidx
} }
idset[[nfold]] = randidx
} }
ret <- list() ret <- list()
for (k in 1:nfold) { for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]]) dtest <- slice(dall, folds[[k]])
didx = c() didx = c()
for (i in 1:nfold) { for (i in 1:nfold) {
if (i != k) { if (i != k) {
didx <- append(didx, idset[[i]]) didx <- append(didx, folds[[i]])
} }
} }
dtrain <- slice(dall, didx) dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest)) bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest) watchlist = list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]]) ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
} }
return (ret) return (ret)
} }

View File

@ -50,7 +50,9 @@
#' @param feval customized evaluation function. Returns #' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given #' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain. #' prediction and dtrain.
#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data} #' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
#' @param folds \code{list} of pre-defined CV folds to use instead of randomly generated ones (each element must be a vector of that fold's indices).
#' If folds are supplied, the nfold and stratified parameters are ignored.
#' @param verbose \code{boolean}, print the statistics during the process #' @param verbose \code{boolean}, print the statistics during the process
#' @param ... other parameters to pass to \code{params}. #' @param ... other parameters to pass to \code{params}.
#' #'
@ -84,10 +86,16 @@
#' #'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
prediction = FALSE, showsd = TRUE, metrics=list(), prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) { obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
if (typeof(params) != "list") { if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list") stop("xgb.cv: first argument params must be list")
} }
if(!is.null(folds)) {
if(class(folds)!="list" | length(folds) < 2) {
stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
}
nfold <- length(folds)
}
if (nfold <= 1) { if (nfold <= 1) {
stop("nfold must be bigger than 1") stop("nfold must be bigger than 1")
} }
@ -102,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
params <- append(params, list("eval_metric"=mc)) params <- append(params, list("eval_metric"=mc))
} }
folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified) xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type = params[['objective']] obj_type = params[['objective']]
mat_pred = FALSE mat_pred = FALSE
if (!is.null(obj_type) && obj_type=='multi:softprob') if (!is.null(obj_type) && obj_type=='multi:softprob')
@ -119,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
for (i in 1:nrounds) { for (i in 1:nrounds) {
msg <- list() msg <- list()
for (k in 1:nfold) { for (k in 1:nfold) {
fd <- folds[[k]] fd <- xgb_folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (i<nrounds) { if (i<nrounds) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]

View File

@ -6,7 +6,8 @@
\usage{ \usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...) obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
verbose = T, ...)
} }
\arguments{ \arguments{
\item{params}{the list of parameters. Commonly used ones are: \item{params}{the list of parameters. Commonly used ones are:
@ -57,7 +58,10 @@ gradient with given prediction and dtrain.}
\code{list(metric='metric-name', value='metric-value')} with given \code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.} prediction and dtrain.}
\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}} \item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
\item{folds}{\code{list} of pre-defined CV folds to use instead of randomly generated ones (each element must be a vector of that fold's indices).
If folds are supplied, the nfold and stratified parameters are ignored.}
\item{verbose}{\code{boolean}, print the statistics during the process} \item{verbose}{\code{boolean}, print the statistics during the process}