[R] Remove enable_categorical parameter (#10018)
This commit is contained in:
parent
3abbbe41ac
commit
df7cf744b4
@ -16,10 +16,6 @@
|
|||||||
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
|
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
|
||||||
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
|
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
|
||||||
#'
|
#'
|
||||||
#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
|
|
||||||
#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
|
|
||||||
#' will be thrown.
|
|
||||||
#'
|
|
||||||
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
|
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
|
||||||
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
|
#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
|
||||||
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
|
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
|
||||||
@ -59,7 +55,7 @@
|
|||||||
#' must be the same as in the DMatrix construction, regardless of the column names.
|
#' must be the same as in the DMatrix construction, regardless of the column names.
|
||||||
#' @param feature_types Set types for features.
|
#' @param feature_types Set types for features.
|
||||||
#'
|
#'
|
||||||
#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
|
#' If `data` is a `data.frame` and `feature_types` is not supplied, feature types will be deduced
|
||||||
#' automatically from the column types.
|
#' automatically from the column types.
|
||||||
#'
|
#'
|
||||||
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
|
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
|
||||||
@ -79,18 +75,6 @@
|
|||||||
#' @param label_lower_bound Lower bound for survival training.
|
#' @param label_lower_bound Lower bound for survival training.
|
||||||
#' @param label_upper_bound Upper bound for survival training.
|
#' @param label_upper_bound Upper bound for survival training.
|
||||||
#' @param feature_weights Set feature weights for column sampling.
|
#' @param feature_weights Set feature weights for column sampling.
|
||||||
#' @param enable_categorical Experimental support of specializing for categorical features.
|
|
||||||
#'
|
|
||||||
#' If passing 'TRUE' and 'data' is a data frame,
|
|
||||||
#' columns of categorical types will automatically
|
|
||||||
#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
|
||||||
#'
|
|
||||||
#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
|
||||||
#' it will result in an error being thrown.
|
|
||||||
#'
|
|
||||||
#' If 'data' is not a data frame, this argument is ignored.
|
|
||||||
#'
|
|
||||||
#' JSON/UBJSON serialization format is required for this.
|
|
||||||
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||||
#' subclass 'xgb.QuantileDMatrix'.
|
#' subclass 'xgb.QuantileDMatrix'.
|
||||||
#'
|
#'
|
||||||
@ -127,8 +111,7 @@ xgb.DMatrix <- function(
|
|||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL
|
||||||
enable_categorical = FALSE
|
|
||||||
) {
|
) {
|
||||||
if (!is.null(group) && !is.null(qid)) {
|
if (!is.null(group) && !is.null(qid)) {
|
||||||
stop("Either one of 'group' or 'qid' should be NULL")
|
stop("Either one of 'group' or 'qid' should be NULL")
|
||||||
@ -180,7 +163,7 @@ xgb.DMatrix <- function(
|
|||||||
nthread
|
nthread
|
||||||
)
|
)
|
||||||
} else if (is.data.frame(data)) {
|
} else if (is.data.frame(data)) {
|
||||||
tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
|
tmp <- .process.df.for.dmatrix(data, feature_types)
|
||||||
feature_types <- tmp$feature_types
|
feature_types <- tmp$feature_types
|
||||||
handle <- .Call(
|
handle <- .Call(
|
||||||
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
|
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
|
||||||
@ -212,7 +195,7 @@ xgb.DMatrix <- function(
|
|||||||
return(dmat)
|
return(dmat)
|
||||||
}
|
}
|
||||||
|
|
||||||
.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
|
.process.df.for.dmatrix <- function(df, feature_types) {
|
||||||
if (!nrow(df) || !ncol(df)) {
|
if (!nrow(df) || !ncol(df)) {
|
||||||
stop("'data' is an empty data.frame.")
|
stop("'data' is an empty data.frame.")
|
||||||
}
|
}
|
||||||
@ -225,12 +208,6 @@ xgb.DMatrix <- function(
|
|||||||
} else {
|
} else {
|
||||||
feature_types <- sapply(df, function(col) {
|
feature_types <- sapply(df, function(col) {
|
||||||
if (is.factor(col)) {
|
if (is.factor(col)) {
|
||||||
if (!enable_categorical) {
|
|
||||||
stop(
|
|
||||||
"When factor type is used, the parameter `enable_categorical`",
|
|
||||||
" must be set to TRUE."
|
|
||||||
)
|
|
||||||
}
|
|
||||||
return("c")
|
return("c")
|
||||||
} else if (is.integer(col)) {
|
} else if (is.integer(col)) {
|
||||||
return("int")
|
return("int")
|
||||||
@ -326,7 +303,6 @@ xgb.QuantileDMatrix <- function(
|
|||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL,
|
||||||
enable_categorical = FALSE,
|
|
||||||
ref = NULL,
|
ref = NULL,
|
||||||
max_bin = NULL
|
max_bin = NULL
|
||||||
) {
|
) {
|
||||||
@ -357,8 +333,7 @@ xgb.QuantileDMatrix <- function(
|
|||||||
qid = qid,
|
qid = qid,
|
||||||
label_lower_bound = label_lower_bound,
|
label_lower_bound = label_lower_bound,
|
||||||
label_upper_bound = label_upper_bound,
|
label_upper_bound = label_upper_bound,
|
||||||
feature_weights = feature_weights,
|
feature_weights = feature_weights
|
||||||
enable_categorical = enable_categorical
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
data_iterator <- .single.data.iterator(iterator_env)
|
data_iterator <- .single.data.iterator(iterator_env)
|
||||||
@ -470,8 +445,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
|
|||||||
qid = env[["qid"]],
|
qid = env[["qid"]],
|
||||||
label_lower_bound = env[["label_lower_bound"]],
|
label_lower_bound = env[["label_lower_bound"]],
|
||||||
label_upper_bound = env[["label_upper_bound"]],
|
label_upper_bound = env[["label_upper_bound"]],
|
||||||
feature_weights = env[["feature_weights"]],
|
feature_weights = env[["feature_weights"]]
|
||||||
enable_categorical = env[["enable_categorical"]]
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -540,8 +514,7 @@ xgb.ProxyDMatrix <- function(
|
|||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL
|
||||||
enable_categorical = FALSE
|
|
||||||
) {
|
) {
|
||||||
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
|
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
|
||||||
out <- list(
|
out <- list(
|
||||||
@ -555,8 +528,7 @@ xgb.ProxyDMatrix <- function(
|
|||||||
qid = qid,
|
qid = qid,
|
||||||
label_lower_bound = label_lower_bound,
|
label_lower_bound = label_lower_bound,
|
||||||
label_upper_bound = label_upper_bound,
|
label_upper_bound = label_upper_bound,
|
||||||
feature_weights = feature_weights,
|
feature_weights = feature_weights
|
||||||
enable_categorical = enable_categorical
|
|
||||||
)
|
)
|
||||||
class(out) <- "xgb.ProxyDMatrix"
|
class(out) <- "xgb.ProxyDMatrix"
|
||||||
return(out)
|
return(out)
|
||||||
@ -575,7 +547,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
|
|||||||
stop("Either one of 'group' or 'qid' should be NULL")
|
stop("Either one of 'group' or 'qid' should be NULL")
|
||||||
}
|
}
|
||||||
if (is.data.frame(lst$data)) {
|
if (is.data.frame(lst$data)) {
|
||||||
tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types)
|
tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
|
||||||
lst$feature_types <- tmp$feature_types
|
lst$feature_types <- tmp$feature_types
|
||||||
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
|
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
|
||||||
rm(tmp)
|
rm(tmp)
|
||||||
|
|||||||
@ -19,8 +19,7 @@ xgb.DMatrix(
|
|||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL
|
||||||
enable_categorical = FALSE
|
|
||||||
)
|
)
|
||||||
|
|
||||||
xgb.QuantileDMatrix(
|
xgb.QuantileDMatrix(
|
||||||
@ -37,7 +36,6 @@ xgb.QuantileDMatrix(
|
|||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL,
|
||||||
enable_categorical = FALSE,
|
|
||||||
ref = NULL,
|
ref = NULL,
|
||||||
max_bin = NULL
|
max_bin = NULL
|
||||||
)
|
)
|
||||||
@ -50,10 +48,6 @@ Supported input types are as follows:\itemize{
|
|||||||
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
|
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
|
||||||
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
|
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
|
||||||
|
|
||||||
If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
|
|
||||||
Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
|
|
||||||
will be thrown.
|
|
||||||
|
|
||||||
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
|
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
|
||||||
encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor}
|
encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
|
||||||
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
|
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
|
||||||
@ -102,7 +96,7 @@ frame and matrix.
|
|||||||
|
|
||||||
\item{feature_types}{Set types for features.
|
\item{feature_types}{Set types for features.
|
||||||
|
|
||||||
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
|
If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
|
||||||
automatically from the column types.
|
automatically from the column types.
|
||||||
|
|
||||||
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||||
@ -129,20 +123,6 @@ functionalities such as feature importances.}
|
|||||||
|
|
||||||
\item{feature_weights}{Set feature weights for column sampling.}
|
\item{feature_weights}{Set feature weights for column sampling.}
|
||||||
|
|
||||||
\item{enable_categorical}{Experimental support of specializing for categorical features.
|
|
||||||
|
|
||||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
|
|
||||||
columns of categorical types will automatically
|
|
||||||
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
|
||||||
|
|
||||||
If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
|
||||||
it will result in an error being thrown.
|
|
||||||
|
|
||||||
If 'data' is not a data frame, this argument is ignored.
|
|
||||||
|
|
||||||
JSON/UBJSON serialization format is required for this.
|
|
||||||
}\if{html}{\out{</div>}}}
|
|
||||||
|
|
||||||
\item{ref}{The training dataset that provides quantile information, needed when creating
|
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||||
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||||
as a reference means that the same quantisation applied to the training data is
|
as a reference means that the same quantisation applied to the training data is
|
||||||
|
|||||||
@ -15,8 +15,7 @@ xgb.ProxyDMatrix(
|
|||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL
|
||||||
enable_categorical = FALSE
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
@ -57,7 +56,7 @@ frame and matrix.
|
|||||||
|
|
||||||
\item{feature_types}{Set types for features.
|
\item{feature_types}{Set types for features.
|
||||||
|
|
||||||
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
|
If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
|
||||||
automatically from the column types.
|
automatically from the column types.
|
||||||
|
|
||||||
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||||
@ -81,20 +80,6 @@ functionalities such as feature importances.}
|
|||||||
\item{label_upper_bound}{Upper bound for survival training.}
|
\item{label_upper_bound}{Upper bound for survival training.}
|
||||||
|
|
||||||
\item{feature_weights}{Set feature weights for column sampling.}
|
\item{feature_weights}{Set feature weights for column sampling.}
|
||||||
|
|
||||||
\item{enable_categorical}{Experimental support of specializing for categorical features.
|
|
||||||
|
|
||||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
|
|
||||||
columns of categorical types will automatically
|
|
||||||
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
|
||||||
|
|
||||||
If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
|
||||||
it will result in an error being thrown.
|
|
||||||
|
|
||||||
If 'data' is not a data frame, this argument is ignored.
|
|
||||||
|
|
||||||
JSON/UBJSON serialization format is required for this.
|
|
||||||
}\if{html}{\out{</div>}}}
|
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
|
An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
|
||||||
|
|||||||
@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
|
|||||||
stringsAsFactors = TRUE
|
stringsAsFactors = TRUE
|
||||||
)
|
)
|
||||||
|
|
||||||
m <- xgb.DMatrix(df, enable_categorical = TRUE)
|
m <- xgb.DMatrix(df)
|
||||||
expect_equal(colnames(m), colnames(df))
|
expect_equal(colnames(m), colnames(df))
|
||||||
expect_equal(
|
expect_equal(
|
||||||
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
|
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
|
||||||
)
|
)
|
||||||
expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
|
|
||||||
|
|
||||||
df <- data.frame(
|
df <- data.frame(
|
||||||
missing = c("a", "b", "d", NA),
|
missing = c("a", "b", "d", NA),
|
||||||
valid = c("a", "b", "d", "c"),
|
valid = c("a", "b", "d", "c"),
|
||||||
stringsAsFactors = TRUE
|
stringsAsFactors = TRUE
|
||||||
)
|
)
|
||||||
m <- xgb.DMatrix(df, enable_categorical = TRUE)
|
m <- xgb.DMatrix(df)
|
||||||
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
|
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user