[R] switch to URI reader (#10024)

2024-02-04 22:03:38 +01:00
parent f2095f1d5b
commit 4de866211d
6 changed files with 103 additions and 20 deletions
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -28,10 +28,27 @@
 #' 'xgb.QuantileDMatrix'.
 #' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
 #' as a single row (only when making predictions from a fitted model).
-#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
-#' supported for xgb.QuantileDMatrix'.
-#' \item Binary files generated by \link{xgb.DMatrix.save},  passed as a path to the file. These are
-#' \bold{not} supported for xgb.QuantileDMatrix'.
+#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
+#' the file, with an optional format specifier.
+#'
+#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
+#'   \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
+#'   \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
+#'         `?format=libsvm` at the end of the file path. It will be the default format if not
+#'         otherwise specified.
+#'   \item CSV files (comma-separated values). This format can be specified by adding suffix
+#'         `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
+#'   }
+#'
+#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
+#' the function will not look at the extension or file contents to determine that it is a comma-separated values file.
+#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
+#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
+#' corresponds to the labels).
+#'
+#' For more information about passing text files as input, see the articles
+#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
+#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
 #' }
 #' @param label Label of the training data. For classification problems, should be passed encoded as
 #' integers with numeration starting at zero.
@@ -81,6 +98,13 @@
 #' @param label_lower_bound Lower bound for survival training.
 #' @param label_upper_bound Upper bound for survival training.
 #' @param feature_weights Set feature weights for column sampling.
+#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
+#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
+#'
+#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
+#' how the file was split beforehand. Defaults to `"row"`.
+#'
+#' This is not used when `data` is not a URI.
 #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
 #' subclass 'xgb.QuantileDMatrix'.
 #'
@@ -117,7 +141,8 @@ xgb.DMatrix <- function(
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
-  feature_weights = NULL
+  feature_weights = NULL,
+  data_split_mode = "row"
 ) {
  if (!is.null(group) && !is.null(qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
@@ -131,7 +156,14 @@ xgb.DMatrix <- function(
      )
    }
    data <- path.expand(data)
-    handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
+    if (data_split_mode == "row") {
+      data_split_mode <- 0L
+    } else if (data_split_mode == "col") {
+      data_split_mode <- 1L
+    } else {
+      stop("Passed invalid 'data_split_mode': ", data_split_mode)
+    }
+    handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
  } else if (is.matrix(data)) {
    handle <- .Call(
      XGDMatrixCreateFromMat_R, data, missing, nthread