add cross validation

This commit is contained in:
tqchen 2014-09-05 22:34:32 -07:00
parent bc1817ca2f
commit 0ecd6c08f3
5 changed files with 66 additions and 10 deletions

View File

@ -8,6 +8,7 @@ export(xgb.dump)
export(xgb.load) export(xgb.load)
export(xgb.save) export(xgb.save)
export(xgb.train) export(xgb.train)
export(xgb.cv)
export(xgboost) export(xgboost)
exportMethods(predict) exportMethods(predict)
import(methods) import(methods)

View File

@ -103,6 +103,10 @@ xgb.get.DMatrix <- function(data, label = NULL) {
} }
return (dtrain) return (dtrain)
} }
# Query the number of rows of a data matrix.
#   dmat: data matrix handle, handed unchanged to the native routine
#         "XGDMatrixNumRow_R" of the xgboost package.
# Returns whatever that native routine returns (the row count).
xgb.numrow <- function(dmat) {
  .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost")
}
# iteratively update booster with customized statistics # iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) { xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") { if (class(booster) != "xgb.Booster") {
@ -174,23 +178,51 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} }
} else { } else {
msg <- "" msg <- ""
} }
return(msg) return(msg)
} }
#------------------------------------------ #------------------------------------------
# helper functions for cross validation # helper functions for cross validation
# #
# Split a data matrix into nfold train/test partitions for cross validation.
#
#   dall:  data matrix holding all rows; queried with xgb.numrow() and
#          subset with slice().
#   nfold: number of folds; must be a single whole number between 1 and the
#          number of rows of dall.
#   param: parameter list forwarded unchanged to xgb.Booster().
#
# Returns a list of length nfold. Element k is
#   list(dtrain = <rows outside fold k>, booster = <booster for that split>,
#        watchlist = list(train = dtrain, test = <rows of fold k>)).
xgb.cv.mknfold <- function(dall, nfold, param) {
  nrow_all <- xgb.numrow(dall)
  if (!is.numeric(nfold) || length(nfold) != 1 || is.na(nfold) ||
      nfold < 1 || nfold != floor(nfold) || nfold > nrow_all) {
    stop("xgb.cv.mknfold: nfold must be a whole number between 1 and the number of rows")
  }
  randidx <- sample(seq_len(nrow_all))
  # Deal the shuffled row indices round-robin into the folds. Every row lands
  # in exactly one fold and fold sizes differ by at most one, even when the
  # row count is not a multiple of nfold (a fractional step size would
  # silently drop the trailing rows).
  fold_of <- rep(seq_len(nfold), length.out = length(randidx))
  idset <- unname(split(randidx, fold_of))
  ret <- vector("list", nfold)
  for (k in seq_len(nfold)) {
    dtest <- slice(dall, idset[[k]])
    # Train on every row that is not in the held-out fold k.
    dtrain <- slice(dall, unlist(idset[-k]))
    bst <- xgb.Booster(param, list(dtrain, dtest))
    ret[[k]] <- list(dtrain = dtrain, booster = bst,
                     watchlist = list(train = dtrain, test = dtest))
  }
  ret
}
# Aggregate the per-fold evaluation messages of one boosting round.
#
#   res:    list with one character vector per fold. The first element of
#           each vector is the round tag; every following element has the
#           form "<metric-name>:<value>".
#   showsd: if TRUE, append "+<standard deviation>" after each mean.
#
# Returns a single string: the round tag of the first fold followed by, for
# each metric, "\t<metric-name>:<mean over folds>" (and "+<sd>" when showsd
# is TRUE). Metric names are taken from the first fold.
xgb.cv.aggcv <- function(res, showsd = TRUE) {
  header <- res[[1]]
  ret <- header[1]
  # seq_along(...)[-1] is empty when there are no metrics, unlike
  # 2:length(header), which would count down to 1.
  for (i in seq_along(header)[-1]) {
    metric <- strsplit(header[i], ":")[[1]][1]
    # Value of metric i in every fold; also works when res has a single fold.
    stats <- vapply(res, function(fields) {
      as.numeric(strsplit(fields[i], ":")[[1]][2])
    }, numeric(1))
    ret <- paste0(ret, "\t", metric, ":", sprintf("%f", mean(stats)))
    if (showsd) {
      ret <- paste0(ret, sprintf("+%f", sd(stats)))
    }
  }
  ret
}

View File

@ -46,12 +46,26 @@
#' #'
#' @export #' @export
#' #'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL,
                   showsd = TRUE, obj = NULL, feval = NULL, ...) {
  # Run nfold cross validation for nrounds boosting rounds.
  #   params:  list of booster parameters; entries passed through ... are
  #            appended to it, followed by silent=1.
  #   data, label: converted to a data matrix with xgb.get.DMatrix().
  #   nrounds: number of boosting rounds.
  #   nfold:   number of folds, forwarded to xgb.cv.mknfold().
  #   showsd:  forwarded to xgb.cv.aggcv().
  #   obj:     forwarded to xgb.iter.update().
  #   feval:   forwarded to xgb.iter.eval().
  # Prints one aggregated evaluation line per round and returns those lines
  # as a list with one string per round.
  if (typeof(params) != "list") {
    stop("xgb.cv: first argument params must be list")
  }
  dtrain <- xgb.get.DMatrix(data, label)
  params <- append(params, list(...))
  params <- append(params, list(silent=1))
  folds <- xgb.cv.mknfold(dtrain, nfold, params)
  history <- vector("list", nrounds)
  # seq_len() yields no iterations for a count of 0, whereas 1:0 would run
  # rounds 1 and 0.
  for (i in seq_len(nrounds)) {
    msg <- vector("list", nfold)
    for (k in seq_len(nfold)) {
      fd <- folds[[k]]
      # Iteration numbers passed to the update/eval helpers are zero-based.
      xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
      msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval), "\t")[[1]]
    }
    ret <- xgb.cv.aggcv(msg, showsd)
    history[[i]] <- ret
    cat(paste(ret, "\n", sep=""))
  }
  history
}

View File

@ -174,6 +174,10 @@ extern "C" {
_WrapperEnd(); _WrapperEnd();
return ret; return ret;
} }
SEXP XGDMatrixNumRow_R(SEXP handle) {
  // Unwrap the external pointer and ask the C API for the row count.
  void *dmat = R_ExternalPtrAddr(handle);
  const bst_ulong num_row = XGDMatrixNumRow(dmat);
  // The count is narrowed to int to build the R integer scalar.
  return ScalarInteger(static_cast<int>(num_row));
}
// functions related to booster // functions related to booster
void _BoosterFinalizer(SEXP ext) { void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return; if (R_ExternalPtrAddr(ext) == NULL) return;

View File

@ -65,6 +65,11 @@ extern "C" {
* \return info vector * \return info vector
*/ */
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field); SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
 * \brief return the number of rows of a data matrix
 * \param handle an instance of data matrix
 * \return the number of rows, as an R integer scalar
 */
SEXP XGDMatrixNumRow_R(SEXP handle);
/*! /*!
* \brief create xgboost learner * \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached * \param dmats a list of dmatrix handles that will be cached