make it possible to use a list of pre-defined CV folds in xgb.cv
This commit is contained in:
parent
c03b42054f
commit
31b0e53cd4
@ -214,12 +214,13 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
|
|||||||
#------------------------------------------
|
#------------------------------------------
|
||||||
# helper functions for cross validation
|
# helper functions for cross validation
|
||||||
#
|
#
|
||||||
xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
|
xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
|
||||||
if (nfold <= 1) {
|
if (nfold <= 1) {
|
||||||
stop("nfold must be bigger than 1")
|
stop("nfold must be bigger than 1")
|
||||||
}
|
}
|
||||||
randidx <- sample(1 : xgb.numrow(dall))
|
if(is.null(folds)) {
|
||||||
y <- getinfo(dall, 'label')
|
y <- getinfo(dall, 'label')
|
||||||
|
randidx <- sample(1 : xgb.numrow(dall))
|
||||||
if (stratified & length(y) == length(randidx)) {
|
if (stratified & length(y) == length(randidx)) {
|
||||||
y <- y[randidx]
|
y <- y[randidx]
|
||||||
# By default assume that y is a classification label,
|
# By default assume that y is a classification label,
|
||||||
@ -227,30 +228,31 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified) {
|
|||||||
# WARNING: if there would be any other objectives with truly
|
# WARNING: if there would be any other objectives with truly
|
||||||
# numerical labels, they currently would not be treated correctly!
|
# numerical labels, they currently would not be treated correctly!
|
||||||
if (param[['objective']] != 'reg:linear') y <- factor(y)
|
if (param[['objective']] != 'reg:linear') y <- factor(y)
|
||||||
idset <- xgb.createFolds(y, nfold)
|
folds <- xgb.createFolds(y, nfold)
|
||||||
} else {
|
} else {
|
||||||
# make simple non-stratified folds
|
# make simple non-stratified folds
|
||||||
kstep <- length(randidx) %/% nfold
|
kstep <- length(randidx) %/% nfold
|
||||||
idset <- list()
|
folds <- list()
|
||||||
for (i in 1:(nfold-1)) {
|
for (i in 1:(nfold-1)) {
|
||||||
idset[[i]] = randidx[1:kstep]
|
folds[[i]] = randidx[1:kstep]
|
||||||
randidx = setdiff(randidx,idset[[i]])
|
randidx = setdiff(randidx, folds[[i]])
|
||||||
|
}
|
||||||
|
folds[[nfold]] = randidx
|
||||||
}
|
}
|
||||||
idset[[nfold]] = randidx
|
|
||||||
}
|
}
|
||||||
ret <- list()
|
ret <- list()
|
||||||
for (k in 1:nfold) {
|
for (k in 1:nfold) {
|
||||||
dtest <- slice(dall, idset[[k]])
|
dtest <- slice(dall, folds[[k]])
|
||||||
didx = c()
|
didx = c()
|
||||||
for (i in 1:nfold) {
|
for (i in 1:nfold) {
|
||||||
if (i != k) {
|
if (i != k) {
|
||||||
didx <- append(didx, idset[[i]])
|
didx <- append(didx, folds[[i]])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dtrain <- slice(dall, didx)
|
dtrain <- slice(dall, didx)
|
||||||
bst <- xgb.Booster(param, list(dtrain, dtest))
|
bst <- xgb.Booster(param, list(dtrain, dtest))
|
||||||
watchlist = list(train=dtrain, test=dtest)
|
watchlist = list(train=dtrain, test=dtest)
|
||||||
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
|
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
|
||||||
}
|
}
|
||||||
return (ret)
|
return (ret)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -50,7 +50,9 @@
|
|||||||
#' @param feval customized evaluation function. Returns
|
#' @param feval customized evaluation function. Returns
|
||||||
#' \code{list(metric='metric-name', value='metric-value')} with given
|
#' \code{list(metric='metric-name', value='metric-value')} with given
|
||||||
#' prediction and dtrain.
|
#' prediction and dtrain.
|
||||||
#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}
|
#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
|
||||||
|
#' @param folds \code{list} of pre-defined CV folds (each element must be a vector of that fold's indices).
|
||||||
|
#' If folds are supplied, the nfold and stratified parameters are ignored.
|
||||||
#' @param verbose \code{boolean}, print the statistics during the process
|
#' @param verbose \code{boolean}, print the statistics during the process
|
||||||
#' @param ... other parameters to pass to \code{params}.
|
#' @param ... other parameters to pass to \code{params}.
|
||||||
#'
|
#'
|
||||||
@ -84,10 +86,16 @@
|
|||||||
#'
|
#'
|
||||||
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
|
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
|
||||||
prediction = FALSE, showsd = TRUE, metrics=list(),
|
prediction = FALSE, showsd = TRUE, metrics=list(),
|
||||||
obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) {
|
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
|
||||||
if (typeof(params) != "list") {
|
if (typeof(params) != "list") {
|
||||||
stop("xgb.cv: first argument params must be list")
|
stop("xgb.cv: first argument params must be list")
|
||||||
}
|
}
|
||||||
|
if(!is.null(folds)) {
|
||||||
|
if(class(folds)!="list" | length(folds) < 2) {
|
||||||
|
stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
|
||||||
|
}
|
||||||
|
nfold <- length(folds)
|
||||||
|
}
|
||||||
if (nfold <= 1) {
|
if (nfold <= 1) {
|
||||||
stop("nfold must be bigger than 1")
|
stop("nfold must be bigger than 1")
|
||||||
}
|
}
|
||||||
@ -102,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
params <- append(params, list("eval_metric"=mc))
|
params <- append(params, list("eval_metric"=mc))
|
||||||
}
|
}
|
||||||
|
|
||||||
folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified)
|
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
|
||||||
obj_type = params[['objective']]
|
obj_type = params[['objective']]
|
||||||
mat_pred = FALSE
|
mat_pred = FALSE
|
||||||
if (!is.null(obj_type) && obj_type=='multi:softprob')
|
if (!is.null(obj_type) && obj_type=='multi:softprob')
|
||||||
@ -119,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
for (i in 1:nrounds) {
|
for (i in 1:nrounds) {
|
||||||
msg <- list()
|
msg <- list()
|
||||||
for (k in 1:nfold) {
|
for (k in 1:nfold) {
|
||||||
fd <- folds[[k]]
|
fd <- xgb_folds[[k]]
|
||||||
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
||||||
if (i<nrounds) {
|
if (i<nrounds) {
|
||||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||||
|
|||||||
@ -6,7 +6,8 @@
|
|||||||
\usage{
|
\usage{
|
||||||
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
||||||
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
|
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
|
||||||
obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...)
|
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
|
||||||
|
verbose = T, ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{params}{the list of parameters. Commonly used ones are:
|
\item{params}{the list of parameters. Commonly used ones are:
|
||||||
@ -57,7 +58,10 @@ gradient with given prediction and dtrain.}
|
|||||||
\code{list(metric='metric-name', value='metric-value')} with given
|
\code{list(metric='metric-name', value='metric-value')} with given
|
||||||
prediction and dtrain.}
|
prediction and dtrain.}
|
||||||
|
|
||||||
\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}}
|
\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
|
||||||
|
|
||||||
|
\item{folds}{\code{list} of pre-defined CV folds (each element must be a vector of that fold's indices).
|
||||||
|
If folds are supplied, the nfold and stratified parameters are ignored.}
|
||||||
|
|
||||||
\item{verbose}{\code{boolean}, print the statistics during the process}
|
\item{verbose}{\code{boolean}, print the statistics during the process}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user