[R] Remove enable_categorical parameter (#10018)

This commit is contained in:
david-cortes 2024-01-30 22:17:36 +01:00 committed by GitHub
parent 3abbbe41ac
commit df7cf744b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 15 additions and 79 deletions

View File

@ -16,10 +16,6 @@
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`. #' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`. #' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#' #'
#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
#' will be thrown.
#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1 #' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor` #' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's #' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
@ -59,7 +55,7 @@
#' must be the same as in the DMatrix construction, regardless of the column names. #' must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features. #' @param feature_types Set types for features.
#' #'
#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced #' If `data` is a `data.frame` and `feature_types` is not supplied, feature types will be deduced
#' automatically from the column types. #' automatically from the column types.
#' #'
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`, #' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
@ -79,18 +75,6 @@
#' @param label_lower_bound Lower bound for survival training. #' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training. #' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling. #' @param feature_weights Set feature weights for column sampling.
#' @param enable_categorical Experimental support of specializing for categorical features.
#'
#' If passing 'TRUE' and 'data' is a data frame,
#' columns of categorical types will automatically
#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
#'
#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
#' it will result in an error being thrown.
#'
#' If 'data' is not a data frame, this argument is ignored.
#'
#' JSON/UBJSON serialization format is required for this.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'. #' subclass 'xgb.QuantileDMatrix'.
#' #'
@ -127,8 +111,7 @@ xgb.DMatrix <- function(
qid = NULL, qid = NULL,
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL
enable_categorical = FALSE
) { ) {
if (!is.null(group) && !is.null(qid)) { if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL") stop("Either one of 'group' or 'qid' should be NULL")
@ -180,7 +163,7 @@ xgb.DMatrix <- function(
nthread nthread
) )
} else if (is.data.frame(data)) { } else if (is.data.frame(data)) {
tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types) tmp <- .process.df.for.dmatrix(data, feature_types)
feature_types <- tmp$feature_types feature_types <- tmp$feature_types
handle <- .Call( handle <- .Call(
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
@ -212,7 +195,7 @@ xgb.DMatrix <- function(
return(dmat) return(dmat)
} }
.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) { .process.df.for.dmatrix <- function(df, feature_types) {
if (!nrow(df) || !ncol(df)) { if (!nrow(df) || !ncol(df)) {
stop("'data' is an empty data.frame.") stop("'data' is an empty data.frame.")
} }
@ -225,12 +208,6 @@ xgb.DMatrix <- function(
} else { } else {
feature_types <- sapply(df, function(col) { feature_types <- sapply(df, function(col) {
if (is.factor(col)) { if (is.factor(col)) {
if (!enable_categorical) {
stop(
"When factor type is used, the parameter `enable_categorical`",
" must be set to TRUE."
)
}
return("c") return("c")
} else if (is.integer(col)) { } else if (is.integer(col)) {
return("int") return("int")
@ -326,7 +303,6 @@ xgb.QuantileDMatrix <- function(
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL,
enable_categorical = FALSE,
ref = NULL, ref = NULL,
max_bin = NULL max_bin = NULL
) { ) {
@ -357,8 +333,7 @@ xgb.QuantileDMatrix <- function(
qid = qid, qid = qid,
label_lower_bound = label_lower_bound, label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound, label_upper_bound = label_upper_bound,
feature_weights = feature_weights, feature_weights = feature_weights
enable_categorical = enable_categorical
) )
) )
data_iterator <- .single.data.iterator(iterator_env) data_iterator <- .single.data.iterator(iterator_env)
@ -470,8 +445,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
qid = env[["qid"]], qid = env[["qid"]],
label_lower_bound = env[["label_lower_bound"]], label_lower_bound = env[["label_lower_bound"]],
label_upper_bound = env[["label_upper_bound"]], label_upper_bound = env[["label_upper_bound"]],
feature_weights = env[["feature_weights"]], feature_weights = env[["feature_weights"]]
enable_categorical = env[["enable_categorical"]]
) )
) )
} }
@ -540,8 +514,7 @@ xgb.ProxyDMatrix <- function(
qid = NULL, qid = NULL,
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL
enable_categorical = FALSE
) { ) {
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix"))) stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
out <- list( out <- list(
@ -555,8 +528,7 @@ xgb.ProxyDMatrix <- function(
qid = qid, qid = qid,
label_lower_bound = label_lower_bound, label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound, label_upper_bound = label_upper_bound,
feature_weights = feature_weights, feature_weights = feature_weights
enable_categorical = enable_categorical
) )
class(out) <- "xgb.ProxyDMatrix" class(out) <- "xgb.ProxyDMatrix"
return(out) return(out)
@ -575,7 +547,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
stop("Either one of 'group' or 'qid' should be NULL") stop("Either one of 'group' or 'qid' should be NULL")
} }
if (is.data.frame(lst$data)) { if (is.data.frame(lst$data)) {
tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types) tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
lst$feature_types <- tmp$feature_types lst$feature_types <- tmp$feature_types
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst) .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
rm(tmp) rm(tmp)

View File

@ -19,8 +19,7 @@ xgb.DMatrix(
qid = NULL, qid = NULL,
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL
enable_categorical = FALSE
) )
xgb.QuantileDMatrix( xgb.QuantileDMatrix(
@ -37,7 +36,6 @@ xgb.QuantileDMatrix(
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL,
enable_categorical = FALSE,
ref = NULL, ref = NULL,
max_bin = NULL max_bin = NULL
) )
@ -50,10 +48,6 @@ Supported input types are as follows:\itemize{
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}. \item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}. \item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
will be thrown.
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1 Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor} encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
@ -102,7 +96,7 @@ frame and matrix.
\item{feature_types}{Set types for features. \item{feature_types}{Set types for features.
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
automatically from the column types. automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data}, Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@ -129,20 +123,6 @@ functionalities such as feature importances.}
\item{feature_weights}{Set feature weights for column sampling.} \item{feature_weights}{Set feature weights for column sampling.}
\item{enable_categorical}{Experimental support of specializing for categorical features.
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
columns of categorical types will automatically
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
If passing 'FALSE' and 'data' is a data frame with categorical columns,
it will result in an error being thrown.
If 'data' is not a data frame, this argument is ignored.
JSON/UBJSON serialization format is required for this.
}\if{html}{\out{</div>}}}
\item{ref}{The training dataset that provides quantile information, needed when creating \item{ref}{The training dataset that provides quantile information, needed when creating
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
as a reference means that the same quantisation applied to the training data is as a reference means that the same quantisation applied to the training data is

View File

@ -15,8 +15,7 @@ xgb.ProxyDMatrix(
qid = NULL, qid = NULL,
label_lower_bound = NULL, label_lower_bound = NULL,
label_upper_bound = NULL, label_upper_bound = NULL,
feature_weights = NULL, feature_weights = NULL
enable_categorical = FALSE
) )
} }
\arguments{ \arguments{
@ -57,7 +56,7 @@ frame and matrix.
\item{feature_types}{Set types for features. \item{feature_types}{Set types for features.
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
automatically from the column types. automatically from the column types.
Otherwise, one can pass a character vector with the same length as number of columns in \code{data}, Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@ -81,20 +80,6 @@ functionalities such as feature importances.}
\item{label_upper_bound}{Upper bound for survival training.} \item{label_upper_bound}{Upper bound for survival training.}
\item{feature_weights}{Set feature weights for column sampling.} \item{feature_weights}{Set feature weights for column sampling.}
\item{enable_categorical}{Experimental support of specializing for categorical features.
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
columns of categorical types will automatically
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
If passing 'FALSE' and 'data' is a data frame with categorical columns,
it will result in an error being thrown.
If 'data' is not a data frame, this argument is ignored.
JSON/UBJSON serialization format is required for this.
}\if{html}{\out{</div>}}}
} }
\value{ \value{
An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the

View File

@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
stringsAsFactors = TRUE stringsAsFactors = TRUE
) )
m <- xgb.DMatrix(df, enable_categorical = TRUE) m <- xgb.DMatrix(df)
expect_equal(colnames(m), colnames(df)) expect_equal(colnames(m), colnames(df))
expect_equal( expect_equal(
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c") getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
) )
expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
df <- data.frame( df <- data.frame(
missing = c("a", "b", "d", NA), missing = c("a", "b", "d", NA),
valid = c("a", "b", "d", "c"), valid = c("a", "b", "d", "c"),
stringsAsFactors = TRUE stringsAsFactors = TRUE
) )
m <- xgb.DMatrix(df, enable_categorical = TRUE) m <- xgb.DMatrix(df)
expect_equal(getinfo(m, "feature_type"), c("c", "c")) expect_equal(getinfo(m, "feature_type"), c("c", "c"))
}) })