Merge pull request #227 from khotilov/master
add stratified cross validation for classification
This commit is contained in:
commit
bab7b58d94
@ -214,34 +214,49 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F
|
|||||||
#------------------------------------------
|
#------------------------------------------
|
||||||
# helper functions for cross validation
|
# helper functions for cross validation
|
||||||
#
|
#
|
||||||
xgb.cv.mknfold <- function(dall, nfold, param) {
|
xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
|
||||||
if (nfold <= 1) {
|
if (nfold <= 1) {
|
||||||
stop("nfold must be bigger than 1")
|
stop("nfold must be bigger than 1")
|
||||||
}
|
}
|
||||||
randidx <- sample(1 : xgb.numrow(dall))
|
if(is.null(folds)) {
|
||||||
kstep <- length(randidx) %/% nfold
|
y <- getinfo(dall, 'label')
|
||||||
idset <- list()
|
randidx <- sample(1 : xgb.numrow(dall))
|
||||||
for (i in 1:(nfold-1)) {
|
if (stratified & length(y) == length(randidx)) {
|
||||||
idset[[i]] = randidx[1:kstep]
|
y <- y[randidx]
|
||||||
randidx = setdiff(randidx,idset[[i]])
|
# By default assume that y is a classification label,
|
||||||
|
# and only leave it numeric for the reg:linear objective.
|
||||||
|
# WARNING: if there would be any other objectives with truly
|
||||||
|
# numerical labels, they currently would not be treated correctly!
|
||||||
|
if (param[['objective']] != 'reg:linear') y <- factor(y)
|
||||||
|
folds <- xgb.createFolds(y, nfold)
|
||||||
|
} else {
|
||||||
|
# make simple non-stratified folds
|
||||||
|
kstep <- length(randidx) %/% nfold
|
||||||
|
folds <- list()
|
||||||
|
for (i in 1:(nfold-1)) {
|
||||||
|
folds[[i]] = randidx[1:kstep]
|
||||||
|
randidx = setdiff(randidx, folds[[i]])
|
||||||
|
}
|
||||||
|
folds[[nfold]] = randidx
|
||||||
|
}
|
||||||
}
|
}
|
||||||
idset[[nfold]] = randidx
|
|
||||||
ret <- list()
|
ret <- list()
|
||||||
for (k in 1:nfold) {
|
for (k in 1:nfold) {
|
||||||
dtest <- slice(dall, idset[[k]])
|
dtest <- slice(dall, folds[[k]])
|
||||||
didx = c()
|
didx = c()
|
||||||
for (i in 1:nfold) {
|
for (i in 1:nfold) {
|
||||||
if (i != k) {
|
if (i != k) {
|
||||||
didx <- append(didx, idset[[i]])
|
didx <- append(didx, folds[[i]])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dtrain <- slice(dall, didx)
|
dtrain <- slice(dall, didx)
|
||||||
bst <- xgb.Booster(param, list(dtrain, dtest))
|
bst <- xgb.Booster(param, list(dtrain, dtest))
|
||||||
watchlist = list(train=dtrain, test=dtest)
|
watchlist = list(train=dtrain, test=dtest)
|
||||||
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
|
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
|
||||||
}
|
}
|
||||||
return (ret)
|
return (ret)
|
||||||
}
|
}
|
||||||
|
|
||||||
xgb.cv.aggcv <- function(res, showsd = TRUE) {
|
xgb.cv.aggcv <- function(res, showsd = TRUE) {
|
||||||
header <- res[[1]]
|
header <- res[[1]]
|
||||||
ret <- header[1]
|
ret <- header[1]
|
||||||
@ -261,3 +276,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
|
|||||||
}
|
}
|
||||||
return (ret)
|
return (ret)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Shamelessly copied from caret::createFolds
|
||||||
|
# and simplified by always returning an unnamed list of test indices
|
||||||
|
xgb.createFolds <- function(y, k = 10)
|
||||||
|
{
|
||||||
|
if(is.numeric(y)) {
|
||||||
|
## Group the numeric data based on their magnitudes
|
||||||
|
## and sample within those groups.
|
||||||
|
|
||||||
|
## When the number of samples is low, we may have
|
||||||
|
## issues further slicing the numeric data into
|
||||||
|
## groups. The number of groups will depend on the
|
||||||
|
## ratio of the number of folds to the sample size.
|
||||||
|
## At most, we will use quantiles. If the sample
|
||||||
|
## is too small, we just do regular unstratified
|
||||||
|
## CV
|
||||||
|
cuts <- floor(length(y)/k)
|
||||||
|
if(cuts < 2) cuts <- 2
|
||||||
|
if(cuts > 5) cuts <- 5
|
||||||
|
y <- cut(y,
|
||||||
|
unique(quantile(y, probs = seq(0, 1, length = cuts))),
|
||||||
|
include.lowest = TRUE)
|
||||||
|
}
|
||||||
|
|
||||||
|
if(k < length(y)) {
|
||||||
|
## reset levels so that the possible levels and
|
||||||
|
## the levels in the vector are the same
|
||||||
|
y <- factor(as.character(y))
|
||||||
|
numInClass <- table(y)
|
||||||
|
foldVector <- vector(mode = "integer", length(y))
|
||||||
|
|
||||||
|
## For each class, balance the fold allocation as far
|
||||||
|
## as possible, then resample the remainder.
|
||||||
|
## The final assignment of folds is also randomized.
|
||||||
|
for(i in 1:length(numInClass)) {
|
||||||
|
## create a vector of integers from 1:k as many times as possible without
|
||||||
|
## going over the number of samples in the class. Note that if the number
|
||||||
|
## of samples in a class is less than k, nothing is producd here.
|
||||||
|
seqVector <- rep(1:k, numInClass[i] %/% k)
|
||||||
|
## add enough random integers to get length(seqVector) == numInClass[i]
|
||||||
|
if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
|
||||||
|
## shuffle the integers for fold assignment and assign to this classes's data
|
||||||
|
foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
|
||||||
|
}
|
||||||
|
} else foldVector <- seq(along = y)
|
||||||
|
|
||||||
|
out <- split(seq(along = y), foldVector)
|
||||||
|
names(out) <- NULL
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|||||||
@ -46,15 +46,25 @@
|
|||||||
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
|
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
|
||||||
#' }
|
#' }
|
||||||
#' @param obj customized objective function. Returns gradient and second order
|
#' @param obj customized objective function. Returns gradient and second order
|
||||||
#' gradient with given prediction and dtrain,
|
#' gradient with given prediction and dtrain.
|
||||||
#' @param feval custimized evaluation function. Returns
|
#' @param feval custimized evaluation function. Returns
|
||||||
#' \code{list(metric='metric-name', value='metric-value')} with given
|
#' \code{list(metric='metric-name', value='metric-value')} with given
|
||||||
#' prediction and dtrain,
|
#' prediction and dtrain.
|
||||||
#' @param verbose \code{boolean}, print the statistics during the process.
|
#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
|
||||||
|
#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
|
||||||
|
#' If folds are supplied, the nfold and stratified parameters would be ignored.
|
||||||
|
#' @param verbose \code{boolean}, print the statistics during the process
|
||||||
#' @param ... other parameters to pass to \code{params}.
|
#' @param ... other parameters to pass to \code{params}.
|
||||||
#'
|
#'
|
||||||
#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set.
|
#' @return
|
||||||
#'
|
#' If \code{prediction = TRUE}, a list with the following elements is returned:
|
||||||
|
#' \itemize{
|
||||||
|
#' \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
|
||||||
|
#' \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
|
||||||
|
#'
|
||||||
#' @details
|
#' @details
|
||||||
#' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
|
#' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
|
||||||
#'
|
#'
|
||||||
@ -76,10 +86,16 @@
|
|||||||
#'
|
#'
|
||||||
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
|
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
|
||||||
prediction = FALSE, showsd = TRUE, metrics=list(),
|
prediction = FALSE, showsd = TRUE, metrics=list(),
|
||||||
obj = NULL, feval = NULL, verbose = T,...) {
|
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) {
|
||||||
if (typeof(params) != "list") {
|
if (typeof(params) != "list") {
|
||||||
stop("xgb.cv: first argument params must be list")
|
stop("xgb.cv: first argument params must be list")
|
||||||
}
|
}
|
||||||
|
if(!is.null(folds)) {
|
||||||
|
if(class(folds)!="list" | length(folds) < 2) {
|
||||||
|
stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
|
||||||
|
}
|
||||||
|
nfold <- length(folds)
|
||||||
|
}
|
||||||
if (nfold <= 1) {
|
if (nfold <= 1) {
|
||||||
stop("nfold must be bigger than 1")
|
stop("nfold must be bigger than 1")
|
||||||
}
|
}
|
||||||
@ -94,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
params <- append(params, list("eval_metric"=mc))
|
params <- append(params, list("eval_metric"=mc))
|
||||||
}
|
}
|
||||||
|
|
||||||
folds <- xgb.cv.mknfold(dtrain, nfold, params)
|
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
|
||||||
obj_type = params[['objective']]
|
obj_type = params[['objective']]
|
||||||
mat_pred = FALSE
|
mat_pred = FALSE
|
||||||
if (!is.null(obj_type) && obj_type=='multi:softprob')
|
if (!is.null(obj_type) && obj_type=='multi:softprob')
|
||||||
@ -111,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
for (i in 1:nrounds) {
|
for (i in 1:nrounds) {
|
||||||
msg <- list()
|
msg <- list()
|
||||||
for (k in 1:nfold) {
|
for (k in 1:nfold) {
|
||||||
fd <- folds[[k]]
|
fd <- xgb_folds[[k]]
|
||||||
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
||||||
if (i<nrounds) {
|
if (i<nrounds) {
|
||||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||||
@ -147,7 +163,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
|
dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
|
||||||
split <- str_split(string = history, pattern = "\t")
|
split <- str_split(string = history, pattern = "\t")
|
||||||
|
|
||||||
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
|
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
|
||||||
|
|
||||||
if (prediction) {
|
if (prediction) {
|
||||||
return(list(dt = dt,pred = predictValues))
|
return(list(dt = dt,pred = predictValues))
|
||||||
|
|||||||
@ -6,7 +6,8 @@
|
|||||||
\usage{
|
\usage{
|
||||||
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
||||||
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
|
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
|
||||||
obj = NULL, feval = NULL, verbose = T, ...)
|
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL,
|
||||||
|
verbose = T, ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{params}{the list of parameters. Commonly used ones are:
|
\item{params}{the list of parameters. Commonly used ones are:
|
||||||
@ -51,18 +52,29 @@ value that represents missing value. Sometime a data use 0 or other extreme valu
|
|||||||
}}
|
}}
|
||||||
|
|
||||||
\item{obj}{customized objective function. Returns gradient and second order
|
\item{obj}{customized objective function. Returns gradient and second order
|
||||||
gradient with given prediction and dtrain,}
|
gradient with given prediction and dtrain.}
|
||||||
|
|
||||||
\item{feval}{custimized evaluation function. Returns
|
\item{feval}{custimized evaluation function. Returns
|
||||||
\code{list(metric='metric-name', value='metric-value')} with given
|
\code{list(metric='metric-name', value='metric-value')} with given
|
||||||
prediction and dtrain,}
|
prediction and dtrain.}
|
||||||
|
|
||||||
\item{verbose}{\code{boolean}, print the statistics during the process.}
|
\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
|
||||||
|
|
||||||
|
\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
|
||||||
|
If folds are supplied, the nfold and stratified parameters would be ignored.}
|
||||||
|
|
||||||
|
\item{verbose}{\code{boolean}, print the statistics during the process}
|
||||||
|
|
||||||
\item{...}{other parameters to pass to \code{params}.}
|
\item{...}{other parameters to pass to \code{params}.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
A \code{data.table} with each mean and standard deviation stat for training set and test set.
|
If \code{prediction = TRUE}, a list with the following elements is returned:
|
||||||
|
\itemize{
|
||||||
|
\item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
|
||||||
|
\item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
|
||||||
|
}
|
||||||
|
|
||||||
|
If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
The cross valudation function of xgboost
|
The cross valudation function of xgboost
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user