[R] switch to URI reader (#10024)
This commit is contained in:
@@ -28,10 +28,27 @@
|
||||
#' 'xgb.QuantileDMatrix'.
|
||||
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
|
||||
#' as a single row (only when making predictions from a fitted model).
|
||||
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||
#' supported for 'xgb.QuantileDMatrix'.
|
||||
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||
#' \bold{not} supported for 'xgb.QuantileDMatrix'.
|
||||
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
|
||||
#' the file, with an optional format specifier.
|
||||
#'
|
||||
#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
|
||||
#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
|
||||
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
|
||||
#' `?format=libsvm` at the end of the file path. It will be the default format if not
|
||||
#' otherwise specified.
|
||||
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
|
||||
#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
|
||||
#' }
|
||||
#'
|
||||
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
|
||||
#' it will not look at the extension or file contents to determine that it is a comma-separated values (CSV) file.
|
||||
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
|
||||
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
|
||||
#' corresponds to the labels).
|
||||
#'
|
||||
#' For more information about passing text files as input, see the articles
|
||||
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
|
||||
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
|
||||
#' }
|
||||
#' @param label Label of the training data. For classification problems, should be passed encoded as
|
||||
#' integers with numeration starting at zero.
|
||||
@@ -81,6 +98,13 @@
|
||||
#' @param label_lower_bound Lower bound for survival training.
|
||||
#' @param label_upper_bound Upper bound for survival training.
|
||||
#' @param feature_weights Set feature weights for column sampling.
|
||||
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
|
||||
#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
|
||||
#'
|
||||
#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
|
||||
#' how the file was split beforehand. Defaults to `"row"`.
|
||||
#'
|
||||
#' This is not used when `data` is not a URI.
|
||||
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||
#' subclass 'xgb.QuantileDMatrix'.
|
||||
#'
|
||||
@@ -117,7 +141,8 @@ xgb.DMatrix <- function(
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL
|
||||
feature_weights = NULL,
|
||||
data_split_mode = "row"
|
||||
) {
|
||||
if (!is.null(group) && !is.null(qid)) {
|
||||
stop("Either one of 'group' or 'qid' should be NULL")
|
||||
@@ -131,7 +156,14 @@ xgb.DMatrix <- function(
|
||||
)
|
||||
}
|
||||
data <- path.expand(data)
|
||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||
if (data_split_mode == "row") {
|
||||
data_split_mode <- 0L
|
||||
} else if (data_split_mode == "col") {
|
||||
data_split_mode <- 1L
|
||||
} else {
|
||||
stop("Passed invalid 'data_split_mode': ", data_split_mode)
|
||||
}
|
||||
handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
|
||||
} else if (is.matrix(data)) {
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromMat_R, data, missing, nthread
|
||||
|
||||
Reference in New Issue
Block a user