[R] switch to URI reader (#10024)

This commit is contained in:
david-cortes
2024-02-04 22:03:38 +01:00
committed by GitHub
parent f2095f1d5b
commit 4de866211d
6 changed files with 103 additions and 20 deletions

View File

@@ -28,10 +28,27 @@
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
#' supported for xgb.QuantileDMatrix'.
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
#' \bold{not} supported for xgb.QuantileDMatrix'.
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
#' the file, with an optional format specifier.
#'
#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
#' `?format=libsvm` at the end of the file path. It will be the default format if not
#' otherwise specified.
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
#' }
#'
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
#' it will not look at the extension or file contents to determine that it is a comma-separated values (CSV) file.
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
#' corresponds to the labels).
#'
#' For more information about passing text files as input, see the articles
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
#' }
#' @param label Label of the training data. For classification problems, should be passed encoded as
#' integers with numeration starting at zero.
@@ -81,6 +98,13 @@
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
#'
#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
#' how the file was split beforehand. Defaults to `"row"`.
#'
#' This is not used when `data` is not a URI.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
@@ -117,7 +141,8 @@ xgb.DMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL
feature_weights = NULL,
data_split_mode = "row"
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
@@ -131,7 +156,14 @@ xgb.DMatrix <- function(
)
}
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
if (data_split_mode == "row") {
data_split_mode <- 0L
} else if (data_split_mode == "col") {
data_split_mode <- 1L
} else {
stop("Passed invalid 'data_split_mode': ", data_split_mode)
}
handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
} else if (is.matrix(data)) {
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, nthread