[R] Make xgb.cv work with xgb.DMatrix only, adding support for survival and ranking fields (#10031)
--------- Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -26,6 +26,11 @@ NVL <- function(x, val) {
|
||||
'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
|
||||
}
|
||||
|
||||
.RANKING_OBJECTIVES <- function() {
|
||||
return(c('binary:logistic', 'binary:logitraw', 'binary:hinge', 'multi:softmax',
|
||||
'multi:softprob'))
|
||||
}
|
||||
|
||||
|
||||
#
|
||||
# Low-level functions for boosting --------------------------------------------
|
||||
@@ -235,33 +240,43 @@ convert.labels <- function(labels, objective_name) {
|
||||
}
|
||||
|
||||
# Generates random (stratified if needed) CV folds
|
||||
generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
|
||||
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
|
||||
if (NROW(group)) {
|
||||
if (stratified) {
|
||||
warning(
|
||||
paste0(
|
||||
"Stratified splitting is not supported when using 'group' attribute.",
|
||||
" Will use unstratified splitting."
|
||||
)
|
||||
)
|
||||
}
|
||||
return(generate.group.folds(nfold, group))
|
||||
}
|
||||
objective <- params$objective
|
||||
if (!is.character(objective)) {
|
||||
warning("Will use unstratified splitting (custom objective used)")
|
||||
stratified <- FALSE
|
||||
}
|
||||
# cannot stratify if label is NULL
|
||||
if (stratified && is.null(label)) {
|
||||
warning("Will use unstratified splitting (no 'labels' available)")
|
||||
stratified <- FALSE
|
||||
}
|
||||
|
||||
# cannot do it for rank
|
||||
objective <- params$objective
|
||||
if (is.character(objective) && strtrim(objective, 5) == 'rank:') {
|
||||
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
|
||||
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n",
|
||||
"\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
|
||||
}
|
||||
# shuffle
|
||||
rnd_idx <- sample.int(nrows)
|
||||
if (stratified &&
|
||||
length(label) == length(rnd_idx)) {
|
||||
if (stratified && length(label) == length(rnd_idx)) {
|
||||
y <- label[rnd_idx]
|
||||
# WARNING: some heuristic logic is employed to identify classification setting!
|
||||
# - For classification, need to convert y labels to factor before making the folds,
|
||||
# and then do stratification by factor levels.
|
||||
# - For regression, leave y numeric and do stratification by quantiles.
|
||||
if (is.character(objective)) {
|
||||
y <- convert.labels(y, params$objective)
|
||||
} else {
|
||||
# If no 'objective' given in params, it means that user either wants to
|
||||
# use the default 'reg:squarederror' objective or has provided a custom
|
||||
# obj function. Here, assume classification setting when y has 5 or less
|
||||
# unique values:
|
||||
if (length(unique(y)) <= 5) {
|
||||
y <- factor(y)
|
||||
}
|
||||
y <- convert.labels(y, objective)
|
||||
}
|
||||
folds <- xgb.createFolds(y = y, k = nfold)
|
||||
} else {
|
||||
@@ -277,6 +292,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
|
||||
return(folds)
|
||||
}
|
||||
|
||||
generate.group.folds <- function(nfold, group) {
|
||||
ngroups <- length(group) - 1
|
||||
if (ngroups < nfold) {
|
||||
stop("DMatrix has fewer groups than folds.")
|
||||
}
|
||||
seq_groups <- seq_len(ngroups)
|
||||
indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1]))
|
||||
assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold))
|
||||
assignments <- unname(assignments)
|
||||
|
||||
out <- vector("list", nfold)
|
||||
randomized_groups <- sample(ngroups)
|
||||
for (idx in seq_len(nfold)) {
|
||||
groups_idx_test <- randomized_groups[assignments[[idx]]]
|
||||
groups_test <- indices[groups_idx_test]
|
||||
idx_test <- unlist(groups_test)
|
||||
attributes(idx_test)$group_test <- lengths(groups_test)
|
||||
attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test])
|
||||
out[[idx]] <- idx_test
|
||||
}
|
||||
return(out)
|
||||
}
|
||||
|
||||
# Creates CV folds stratified by the values of y.
|
||||
# It was borrowed from caret::createFolds and simplified
|
||||
# by always returning an unnamed list of fold indices.
|
||||
|
||||
@@ -1259,8 +1259,11 @@ xgb.get.DMatrix.data <- function(dmat) {
|
||||
#' Get a new DMatrix containing the specified rows of
|
||||
#' original xgb.DMatrix object
|
||||
#'
|
||||
#' @param object Object of class "xgb.DMatrix"
|
||||
#' @param idxset a integer vector of indices of rows needed
|
||||
#' @param object Object of class "xgb.DMatrix".
|
||||
#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
|
||||
#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
|
||||
#' equivalently `qid`) field. Note that in such case, the result will not have
|
||||
#' the groups anymore - they need to be set manually through `setinfo`.
|
||||
#' @param colset currently not used (columns subsetting is not available)
|
||||
#'
|
||||
#' @examples
|
||||
@@ -1275,11 +1278,11 @@ xgb.get.DMatrix.data <- function(dmat) {
|
||||
#'
|
||||
#' @rdname xgb.slice.DMatrix
|
||||
#' @export
|
||||
xgb.slice.DMatrix <- function(object, idxset) {
|
||||
xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
|
||||
if (!inherits(object, "xgb.DMatrix")) {
|
||||
stop("object must be xgb.DMatrix")
|
||||
}
|
||||
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)
|
||||
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups)
|
||||
|
||||
attr_list <- attributes(object)
|
||||
nr <- nrow(object)
|
||||
@@ -1296,7 +1299,15 @@ xgb.slice.DMatrix <- function(object, idxset) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return(structure(ret, class = "xgb.DMatrix"))
|
||||
|
||||
out <- structure(ret, class = "xgb.DMatrix")
|
||||
parent_fields <- as.list(attributes(object)$fields)
|
||||
if (NROW(parent_fields)) {
|
||||
child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))]
|
||||
child_fields <- as.environment(child_fields)
|
||||
attributes(out)$fields <- child_fields
|
||||
}
|
||||
return(out)
|
||||
}
|
||||
|
||||
#' @rdname xgb.slice.DMatrix
|
||||
@@ -1340,11 +1351,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
|
||||
}
|
||||
|
||||
cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
|
||||
infos <- character(0)
|
||||
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
|
||||
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
|
||||
if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin')
|
||||
if (length(infos) == 0) infos <- 'NA'
|
||||
infos <- names(attributes(x)$fields)
|
||||
infos <- infos[infos != "feature_name"]
|
||||
if (!NROW(infos)) infos <- "NA"
|
||||
infos <- infos[order(infos)]
|
||||
infos <- paste(infos, collapse = ", ")
|
||||
cat(infos)
|
||||
cnames <- colnames(x)
|
||||
cat(' colnames:')
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#' Cross Validation
|
||||
#'
|
||||
#' The cross validation function of xgboost
|
||||
#' The cross validation function of xgboost.
|
||||
#'
|
||||
#' @param params the list of parameters. The complete list of parameters is
|
||||
#' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
|
||||
@@ -19,13 +19,17 @@
|
||||
#'
|
||||
#' See \code{\link{xgb.train}} for further details.
|
||||
#' See also demo/ for walkthrough example in R.
|
||||
#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
|
||||
#'
|
||||
#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if
|
||||
#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
|
||||
#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand.
|
||||
#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required
|
||||
#' for model training by the objective.
|
||||
#'
|
||||
#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
|
||||
#' or `xgb.ExternalDMatrix` are not supported here.
|
||||
#' @param nrounds the max number of iterations
|
||||
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
|
||||
#' @param label vector of response values. Should be provided only when data is an R-matrix.
|
||||
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
|
||||
#' that NA values should be considered as 'missing' by the algorithm.
|
||||
#' Sometimes, 0 or other extreme value might be used to represent missing values.
|
||||
#' @param prediction A logical value indicating whether to return the test fold predictions
|
||||
#' from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.
|
||||
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
|
||||
@@ -47,13 +51,30 @@
|
||||
#' @param feval customized evaluation function. Returns
|
||||
#' \code{list(metric='metric-name', value='metric-value')} with given
|
||||
#' prediction and dtrain.
|
||||
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
|
||||
#' by the values of outcome labels.
|
||||
#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified
|
||||
#' by the values of outcome labels. For real-valued labels in regression objectives,
|
||||
#' stratification will be done by discretizing the labels into up to 5 buckets beforehand.
|
||||
#'
|
||||
#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
|
||||
#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
|
||||
#' `FALSE` otherwise.
|
||||
#'
|
||||
#' This parameter is ignored when `data` has a `group` field - in such case, the splitting
|
||||
#' will be based on whole groups (note that this might make the folds have different sizes).
|
||||
#'
|
||||
#' Value `TRUE` here is \bold{not} supported for custom objectives.
|
||||
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
|
||||
#' (each element must be a vector of test fold's indices). When folds are supplied,
|
||||
#' the \code{nfold} and \code{stratified} parameters are ignored.
|
||||
#'
|
||||
#' If `data` has a `group` field and the objective requires this field, each fold (list element)
|
||||
#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test`
|
||||
#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to
|
||||
#' the resulting DMatrices.
|
||||
#' @param train_folds \code{list} list specifying which indicies to use for training. If \code{NULL}
|
||||
#' (the default) all indices not specified in \code{folds} will be used for training.
|
||||
#'
|
||||
#' This is not supported when `data` has `group` field.
|
||||
#' @param verbose \code{boolean}, print the statistics during the process
|
||||
#' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
|
||||
#' Default is 1 which means all messages are printed. This parameter is passed to the
|
||||
@@ -118,13 +139,14 @@
|
||||
#' print(cv, verbose=TRUE)
|
||||
#'
|
||||
#' @export
|
||||
xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
|
||||
xgb.cv <- function(params = list(), data, nrounds, nfold,
|
||||
prediction = FALSE, showsd = TRUE, metrics = list(),
|
||||
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL,
|
||||
obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL,
|
||||
verbose = TRUE, print_every_n = 1L,
|
||||
early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) {
|
||||
|
||||
check.deprecation(...)
|
||||
stopifnot(inherits(data, "xgb.DMatrix"))
|
||||
if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) {
|
||||
stop("'data' is an invalid 'xgb.DMatrix' object. Must be constructed again.")
|
||||
}
|
||||
@@ -137,16 +159,22 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
|
||||
check.custom.obj()
|
||||
check.custom.eval()
|
||||
|
||||
# Check the labels
|
||||
if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) ||
|
||||
(!inherits(data, 'xgb.DMatrix') && is.null(label))) {
|
||||
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
|
||||
} else if (inherits(data, 'xgb.DMatrix')) {
|
||||
if (!is.null(label))
|
||||
warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix")
|
||||
cv_label <- getinfo(data, 'label')
|
||||
} else {
|
||||
cv_label <- label
|
||||
if (stratified == "auto") {
|
||||
if (is.character(params$objective)) {
|
||||
stratified <- (
|
||||
(params$objective %in% .CLASSIFICATION_OBJECTIVES())
|
||||
&& !(params$objective %in% .RANKING_OBJECTIVES())
|
||||
)
|
||||
} else {
|
||||
stratified <- FALSE
|
||||
}
|
||||
}
|
||||
|
||||
# Check the labels and groups
|
||||
cv_label <- getinfo(data, "label")
|
||||
cv_group <- getinfo(data, "group")
|
||||
if (!is.null(train_folds) && NROW(cv_group)) {
|
||||
stop("'train_folds' is not supported for DMatrix object with 'group' field.")
|
||||
}
|
||||
|
||||
# CV folds
|
||||
@@ -157,7 +185,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
|
||||
} else {
|
||||
if (nfold <= 1)
|
||||
stop("'nfold' must be > 1")
|
||||
folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params)
|
||||
folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params)
|
||||
}
|
||||
|
||||
# Callbacks
|
||||
@@ -195,20 +223,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
|
||||
|
||||
# create the booster-folds
|
||||
# train_folds
|
||||
dall <- xgb.get.DMatrix(
|
||||
data = data,
|
||||
label = label,
|
||||
missing = missing,
|
||||
weight = NULL,
|
||||
nthread = params$nthread
|
||||
)
|
||||
dall <- data
|
||||
bst_folds <- lapply(seq_along(folds), function(k) {
|
||||
dtest <- xgb.slice.DMatrix(dall, folds[[k]])
|
||||
dtest <- xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE)
|
||||
# code originally contributed by @RolandASc on stackoverflow
|
||||
if (is.null(train_folds))
|
||||
dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]))
|
||||
dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE)
|
||||
else
|
||||
dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]])
|
||||
dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE)
|
||||
if (!is.null(attributes(folds[[k]])$group_test)) {
|
||||
setinfo(dtest, "group", attributes(folds[[k]])$group_test)
|
||||
setinfo(dtrain, "group", attributes(folds[[k]])$group_train)
|
||||
}
|
||||
bst <- xgb.Booster(
|
||||
params = params,
|
||||
cachelist = list(dtrain, dtest),
|
||||
@@ -312,8 +338,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' train <- agaricus.train
|
||||
#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
|
||||
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2,
|
||||
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
#' print(cv)
|
||||
#' print(cv, verbose=TRUE)
|
||||
#'
|
||||
|
||||
Reference in New Issue
Block a user