From cdea1685e581c081b72a7107bb6fb899fa6c5c2f Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 2 Jan 2015 11:21:53 +0100 Subject: [PATCH] Add a new verbose parameter to print progress during the process (set to true by default to not change behavior of existing code) + source code refactoring --- R-package/NAMESPACE | 1 - R-package/R/xgb.cv.R | 26 ++++++++++++-------------- R-package/man/xgb.cv.Rd | 9 ++++++--- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index bd12fc7ec..6e74d9ac2 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -25,5 +25,4 @@ importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) importFrom(stringr,str_replace) -importFrom(stringr,str_replace_all) importFrom(stringr,str_split) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index c2e73e202..7256980c6 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -8,8 +8,8 @@ #' @importFrom data.table := #' @importFrom data.table rbindlist #' @importFrom stringr str_extract_all +#' @importFrom stringr str_extract #' @importFrom stringr str_split -#' @importFrom stringr str_replace_all #' @importFrom stringr str_replace #' @importFrom stringr str_match #' @@ -31,7 +31,7 @@ #' @param nrounds the max number of iterations #' @param nfold number of folds used #' @param label option field, when data is Matrix -#' @param showsd boolean, whether show standard deviation of cross validation +#' @param showsd \code{boolean}, whether show standard deviation of cross validation #' @param metrics, list of evaluation metrics to be used in corss validation, #' when it is not specified, the evaluation metric is chosen according to objective function. #' Possible options are: @@ -49,9 +49,10 @@ #' prediction and dtrain, #' @param missing Missing is only used when input is dense matrix, pick a float # value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param verbose \code{boolean}, print the statistics during the process. #' @param ... other parameters to pass to \code{params}. #' -#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set. +#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set. #' #' @details #' This is the cross validation function for xgboost @@ -66,10 +67,11 @@ #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) #' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"), #' "max.depth"=3, "eta"=1, "objective"="binary:logistic") +#' print(history) #' @export #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, - showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) { + showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -94,28 +96,24 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (k in 1:nfold) { fd <- folds[[k]] succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) - msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval), - "\t")[[1]] + msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] } ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) - cat(paste(ret, "\n", sep="")) + if(verbose) paste(ret, "\n", sep="") %>% cat } - colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".") - + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnamesMean <- paste(colnames, "mean") colnamesStd <- paste(colnames, "std") + colnames <- c() for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) type <- rep(x = "numeric", times = length(colnames)) - dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table - split = str_split(string = history, pattern = "\t") - for(line in split){ - dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)} - } + + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)} dt } \ No newline at end of file diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19f04ee79..7ba5eb727 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -6,7 +6,7 @@ \usage{ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, showsd = TRUE, metrics = list(), obj = NULL, - feval = NULL, ...) + feval = NULL, verbose = T, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -34,7 +34,7 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, \item{missing}{Missing is only used when input is dense matrix, pick a float} -\item{showsd}{boolean, whether show standard deviation of cross validation} +\item{showsd}{\code{boolean}, whether show standard deviation of cross validation} \item{metrics,}{list of evaluation metrics to be used in corss validation, when it is not specified, the evaluation metric is chosen according to objective function. @@ -54,10 +54,12 @@ gradient with given prediction and dtrain,} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain,} +\item{verbose}{\code{boolean}, print the statistics during the process.} + \item{...}{other parameters to pass to \code{params}.} } \value{ -a \code{data.table} with each mean and standard deviation stat for training set and test set. +A \code{data.table} with each mean and standard deviation stat for training set and test set. } \description{ The cross valudation function of xgboost @@ -75,5 +77,6 @@ data(agaricus.train, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"), "max.depth"=3, "eta"=1, "objective"="binary:logistic") +print(history) }