From f325930bd9791dc5d1d26994c24ad56bf3037d4e Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 1 May 2015 15:08:08 -0500 Subject: [PATCH 1/3] Improved logic in stratified CV to guess class/regr Somewhat more robust and clear logic in stratified CV to guess classification/regression settings. Allows accommodating custom objectives (classification is assumed when the number of unique values in labels is <= 5). --- R-package/R/utils.R | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 39609744e..49f6fdd6c 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -211,6 +211,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F } return(msg) } + #------------------------------------------ # helper functions for cross validation # @@ -223,11 +224,23 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { randidx <- sample(1 : xgb.numrow(dall)) if (stratified & length(y) == length(randidx)) { y <- y[randidx] - # By default assume that y is a classification label, - # and only leave it numeric for the reg:linear objective. - # WARNING: if there would be any other objectives with truly - # numerical labels, they currently would not be treated correctly! - if (param[['objective']] != 'reg:linear') y <- factor(y) + # + # WARNING: some heuristic logic is employed to identify classification setting! + # + # For classification, need to convert y labels to factor before making the folds, + # and then do stratification by factor levels. + # For regression, leave y numeric and do stratification by quantiles. 
+ n_uniq <- length(unique(y)) + if (exists('objective', where=param)) { + # If 'objective' provided in params, assume that y is a classification label + # unless objective is reg:linear + if (param[['objective']] != 'reg:linear') y <- factor(y) + } else { + # If no 'objective' given in params, it means that user either wants to use + # the default 'reg:linear' objective or has provided a custom obj function. + # Here, assume classification setting when y has 5 or less unique values: + if (length(unique(y)) <= 5) y <- factor(y) + } folds <- xgb.createFolds(y, nfold) } else { # make simple non-stratified folds From 0a3e7722fd3c99e2a85d4c0b282b087a55832164 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 1 May 2015 15:16:30 -0500 Subject: [PATCH 2/3] a safeguard against someone using automatic folds creation with ranking --- R-package/R/utils.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 49f6fdd6c..7a3731546 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -220,6 +220,10 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { stop("nfold must be bigger than 1") } if(is.null(folds)) { + if (exists('objective', where=param) && strtrim(param[['objective']], 5) == 'rank:') { + stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n", + "\tConsider providing pre-computed CV-folds through the folds parameter.") + } y <- getinfo(dall, 'label') randidx <- sample(1 : xgb.numrow(dall)) if (stratified & length(y) == length(randidx)) { From c18e081f486792ce6eeacb9e8110755ea685c0ef Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 1 May 2015 16:16:50 -0500 Subject: [PATCH 3/3] cleanup --- R-package/R/utils.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 7a3731546..4a5d99c7d 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -234,7 +234,6 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, 
folds) { # For classification, need to convert y labels to factor before making the folds, # and then do stratification by factor levels. # For regression, leave y numeric and do stratification by quantiles. - n_uniq <- length(unique(y)) if (exists('objective', where=param)) { # If 'objective' provided in params, assume that y is a classification label # unless objective is reg:linear