[R] Remove enable_categorical parameter (#10018)

2024-01-30 22:17:36 +01:00
parent 3abbbe41ac
commit df7cf744b4
4 changed files with 15 additions and 79 deletions
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -16,10 +16,6 @@
 #' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
 #' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
 #'
-#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
-#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
-#' will be thrown.
-#'
 #' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
 #' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
 #' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
@@ -59,7 +55,7 @@
 #'        must be the same as in the DMatrix construction, regardless of the column names.
 #' @param feature_types Set types for features.
 #'
-#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
+#' If `data` is a `data.frame` and `feature_types` is not supplied, feature types will be deduced
 #' automatically from the column types.
 #'
 #' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
@@ -79,18 +75,6 @@
 #' @param label_lower_bound Lower bound for survival training.
 #' @param label_upper_bound Upper bound for survival training.
 #' @param feature_weights Set feature weights for column sampling.
-#' @param enable_categorical Experimental support of specializing for categorical features.
-#'
-#'                           If passing 'TRUE' and 'data' is a data frame,
-#'                           columns of categorical types will automatically
-#'                           be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-#'
-#'                           If passing 'FALSE' and 'data' is a data frame with categorical columns,
-#'                           it will result in an error being thrown.
-#'
-#'                           If 'data' is not a data frame, this argument is ignored.
-#'
-#'                           JSON/UBJSON serialization format is required for this.
 #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
 #' subclass 'xgb.QuantileDMatrix'.
 #'
@@ -127,8 +111,7 @@ xgb.DMatrix <- function(
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
-  feature_weights = NULL,
+  feature_weights = NULL
-  enable_categorical = FALSE
 ) {
  if (!is.null(group) && !is.null(qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
@@ -180,7 +163,7 @@ xgb.DMatrix <- function(
      nthread
    )
  } else if (is.data.frame(data)) {
-    tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
+    tmp <- .process.df.for.dmatrix(data, feature_types)
    feature_types <- tmp$feature_types
    handle <- .Call(
      XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
@@ -212,7 +195,7 @@ xgb.DMatrix <- function(
  return(dmat)
 }
-.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
+.process.df.for.dmatrix <- function(df, feature_types) {
  if (!nrow(df) || !ncol(df)) {
    stop("'data' is an empty data.frame.")
  }
@@ -225,12 +208,6 @@ xgb.DMatrix <- function(
  } else {
    feature_types <- sapply(df, function(col) {
      if (is.factor(col)) {
-        if (!enable_categorical) {
-          stop(
-            "When factor type is used, the parameter `enable_categorical`",
-            " must be set to TRUE."
-          )
-        }
        return("c")
      } else if (is.integer(col)) {
        return("int")
@@ -326,7 +303,6 @@ xgb.QuantileDMatrix <- function(
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
-  enable_categorical = FALSE,
  ref = NULL,
  max_bin = NULL
 ) {
@@ -357,8 +333,7 @@ xgb.QuantileDMatrix <- function(
      qid = qid,
      label_lower_bound = label_lower_bound,
      label_upper_bound = label_upper_bound,
-      feature_weights = feature_weights,
+      feature_weights = feature_weights
-      enable_categorical = enable_categorical
    )
  )
  data_iterator <- .single.data.iterator(iterator_env)
@@ -470,8 +445,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
      qid = env[["qid"]],
      label_lower_bound = env[["label_lower_bound"]],
      label_upper_bound = env[["label_upper_bound"]],
-      feature_weights = env[["feature_weights"]],
+      feature_weights = env[["feature_weights"]]
-      enable_categorical = env[["enable_categorical"]]
    )
  )
 }
@@ -540,8 +514,7 @@ xgb.ProxyDMatrix <- function(
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
-  feature_weights = NULL,
+  feature_weights = NULL
-  enable_categorical = FALSE
 ) {
  stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
  out <- list(
@@ -555,8 +528,7 @@ xgb.ProxyDMatrix <- function(
    qid = qid,
    label_lower_bound = label_lower_bound,
    label_upper_bound = label_upper_bound,
-    feature_weights = feature_weights,
+    feature_weights = feature_weights
-    enable_categorical = enable_categorical
  )
  class(out) <- "xgb.ProxyDMatrix"
  return(out)
@@ -575,7 +547,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }
  if (is.data.frame(lst$data)) {
-    tmp <- .process.df.for.dmatrix(lst$data, lst$enable_categorical, lst$feature_types)
+    tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
    lst$feature_types <- tmp$feature_types
    .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
    rm(tmp)
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -19,8 +19,7 @@ xgb.DMatrix(
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
-  feature_weights = NULL,
+  feature_weights = NULL
-  enable_categorical = FALSE
 )
 xgb.QuantileDMatrix(
@@ -37,7 +36,6 @@ xgb.QuantileDMatrix(
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
-  enable_categorical = FALSE,
  ref = NULL,
  max_bin = NULL
 )
@@ -50,10 +48,6 @@ Supported input types are as follows:\itemize{
 \item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
 \item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
-If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
-Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
-will be thrown.
 Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
 encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor}
 types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
@@ -102,7 +96,7 @@ frame and matrix.
 \item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
 automatically from the column types.
 Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -129,20 +123,6 @@ functionalities such as feature importances.}
 \item{feature_weights}{Set feature weights for column sampling.}
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-\if{html}{\out{<div class="sourceCode">}}\preformatted{                      If passing 'TRUE' and 'data' is a data frame,
-                      columns of categorical types will automatically
-                      be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-                      If passing 'FALSE' and 'data' is a data frame with categorical columns,
-                      it will result in an error being thrown.
-                      If 'data' is not a data frame, this argument is ignored.
-                      JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{</div>}}}
 \item{ref}{The training dataset that provides quantile information, needed when creating
 validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
 as a reference means that the same quantisation applied to the training data is
--- a/R-package/man/xgb.ProxyDMatrix.Rd
+++ b/R-package/man/xgb.ProxyDMatrix.Rd
@@ -15,8 +15,7 @@ xgb.ProxyDMatrix(
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
-  feature_weights = NULL,
+  feature_weights = NULL
-  enable_categorical = FALSE
 )
 }
 \arguments{
@@ -57,7 +56,7 @@ frame and matrix.
 \item{feature_types}{Set types for features.
-If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+If \code{data} is a \code{data.frame} and \code{feature_types} is not supplied, feature types will be deduced
 automatically from the column types.
 Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
@@ -81,20 +80,6 @@ functionalities such as feature importances.}
 \item{label_upper_bound}{Upper bound for survival training.}
 \item{feature_weights}{Set feature weights for column sampling.}
-\item{enable_categorical}{Experimental support of specializing for categorical features.
-\if{html}{\out{<div class="sourceCode">}}\preformatted{                      If passing 'TRUE' and 'data' is a data frame,
-                      columns of categorical types will automatically
-                      be set to be of categorical type (feature_type='c') in the resulting DMatrix.
-                      If passing 'FALSE' and 'data' is a data frame with categorical columns,
-                      it will result in an error being thrown.
-                      If 'data' is not a data frame, this argument is ignored.
-                      JSON/UBJSON serialization format is required for this.
-}\if{html}{\out{</div>}}}
 }
 \value{
 An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
    stringsAsFactors = TRUE
  )
-  m <- xgb.DMatrix(df, enable_categorical = TRUE)
+  m <- xgb.DMatrix(df)
  expect_equal(colnames(m), colnames(df))
  expect_equal(
    getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
  )
-  expect_error(xgb.DMatrix(df, enable_categorical = FALSE))
  df <- data.frame(
    missing = c("a", "b", "d", NA),
    valid = c("a", "b", "d", "c"),
    stringsAsFactors = TRUE
  )
-  m <- xgb.DMatrix(df, enable_categorical = TRUE)
+  m <- xgb.DMatrix(df)
  expect_equal(getinfo(m, "feature_type"), c("c", "c"))
 })