From 31b0e53cd462460ff1b80e281872dd3c3903f3fd Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 3 Apr 2015 13:24:04 -0500 Subject: [PATCH] make it possible to use a list of pre-defined CV folds in xgb.cv --- R-package/R/utils.R | 46 +++++++++++++++++++++-------------------- R-package/R/xgb.cv.R | 16 ++++++++++---- R-package/man/xgb.cv.Rd | 8 +++++-- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index ca32901dc..a9ea767a8 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -214,43 +214,45 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F #------------------------------------------ # helper functions for cross validation # -xgb.cv.mknfold <- function(dall, nfold, param, stratified) { +xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { if (nfold <= 1) { stop("nfold must be bigger than 1") } - randidx <- sample(1 : xgb.numrow(dall)) - y <- getinfo(dall, 'label') - if (stratified & length(y) == length(randidx)) { - y <- y[randidx] - # By default assume that y is a classification label, - # and only leave it numeric for the reg:linear objective. - # WARNING: if there would be any other objectives with truly - # numerical labels, they currently would not be treated correctly! - if (param[['objective']] != 'reg:linear') y <- factor(y) - idset <- xgb.createFolds(y, nfold) - } else { - # make simple non-stratified folds - kstep <- length(randidx) %/% nfold - idset <- list() - for (i in 1:(nfold-1)) { - idset[[i]] = randidx[1:kstep] - randidx = setdiff(randidx,idset[[i]]) + if(is.null(folds)) { + y <- getinfo(dall, 'label') + randidx <- sample(1 : xgb.numrow(dall)) + if (stratified & length(y) == length(randidx)) { + y <- y[randidx] + # By default assume that y is a classification label, + # and only leave it numeric for the reg:linear objective. + # WARNING: if there would be any other objectives with truly + # numerical labels, they currently would not be treated correctly! + if (param[['objective']] != 'reg:linear') y <- factor(y) + folds <- xgb.createFolds(y, nfold) + } else { + # make simple non-stratified folds + kstep <- length(randidx) %/% nfold + folds <- list() + for (i in 1:(nfold-1)) { + folds[[i]] = randidx[1:kstep] + randidx = setdiff(randidx, folds[[i]]) + } + folds[[nfold]] = randidx } - idset[[nfold]] = randidx } ret <- list() for (k in 1:nfold) { - dtest <- slice(dall, idset[[k]]) + dtest <- slice(dall, folds[[k]]) didx = c() for (i in 1:nfold) { if (i != k) { - didx <- append(didx, idset[[i]]) + didx <- append(didx, folds[[i]]) } } dtrain <- slice(dall, didx) bst <- xgb.Booster(param, list(dtrain, dtest)) watchlist = list(train=dtrain, test=dtest) - ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]]) + ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]]) } return (ret) } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index db2ecf103..e5f5c7b72 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -50,7 +50,9 @@ #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data} +#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data} +#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). +#' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process #' @param ... other parameters to pass to \code{params}. #' @@ -84,10 +86,16 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } + if(!is.null(folds)) { + if(class(folds)!="list" | length(folds) < 2) { + stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold") + } + nfold <- length(folds) + } if (nfold <= 1) { stop("nfold must be bigger than 1") } @@ -102,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = params <- append(params, list("eval_metric"=mc)) } - folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified) + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type = params[['objective']] mat_pred = FALSE if (!is.null(obj_type) && obj_type=='multi:softprob') @@ -119,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { - fd <- folds[[k]] + fd <- xgb_folds[[k]] succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) if (i% str_split("\t") %>% .[[1]] diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 20423f76a..feee4e18f 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -6,7 +6,8 @@ \usage{ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...) + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, + verbose = T, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -57,7 +58,10 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}} +\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}} + +\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). +If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process}