[R] switch to URI reader (#10024)

This commit is contained in:
david-cortes 2024-02-04 22:03:38 +01:00 committed by GitHub
parent f2095f1d5b
commit 4de866211d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 103 additions and 20 deletions

View File

@ -28,10 +28,27 @@
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
#' supported for xgb.QuantileDMatrix'.
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
#' \bold{not} supported for xgb.QuantileDMatrix'.
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
#' the file, with an optional format specifier.
#'
#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
#' `?format=libsvm` at the end of the file path. It will be the default format if not
#' otherwise specified.
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
#' }
#'
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
#' it will not look at the extension or file contents to determine that it is a comma-separated value.
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
#' corresponds to the labels).
#'
#' For more information about passing text files as input, see the articles
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
#' }
#' @param label Label of the training data. For classification problems, should be passed encoded as
#' integers with numeration starting at zero.
@ -81,6 +98,13 @@
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
#'
#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
#' how the file was split beforehand. Defaults to `"row"`.
#'
#' This is not used when `data` is not a URI.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
@ -117,7 +141,8 @@ xgb.DMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL
feature_weights = NULL,
data_split_mode = "row"
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
@ -131,7 +156,14 @@ xgb.DMatrix <- function(
)
}
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
if (data_split_mode == "row") {
data_split_mode <- 0L
} else if (data_split_mode == "col") {
data_split_mode <- 1L
} else {
stop("Passed invalid 'data_split_mode': ", data_split_mode)
}
handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
} else if (is.matrix(data)) {
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, nthread

View File

@ -19,7 +19,8 @@ xgb.DMatrix(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL
feature_weights = NULL,
data_split_mode = "row"
)
xgb.QuantileDMatrix(
@ -60,10 +61,27 @@ Other column types are not supported.
'xgb.QuantileDMatrix'.
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
as a single row (only when making predictions from a fitted model).
\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
supported for xgb.QuantileDMatrix'.
\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
\bold{not} supported for xgb.QuantileDMatrix'.
\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
the file, with an optional format specifier.
These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
\code{?format=libsvm} at the end of the file path. It will be the default format if not
otherwise specified.
\item CSV files (comma-separated values). This format can be specified by adding suffix
\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
}
Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
it will not look at the extension or file contents to determine that it is a comma-separated value.
Instead, the format must be specified following the URI format, so the input to \code{data} should be passed
like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column
corresponds to the labels).
For more information about passing text files as input, see the articles
\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
}}
\item{label}{Label of the training data. For classification problems, should be passed encoded as
@ -129,6 +147,14 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
\item{feature_weights}{Set feature weights for column sampling.}
\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals
whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}.
In distributed mode, the file is split accordingly; otherwise this is only an indicator on
how the file was split beforehand. Defaults to \code{"row"}.
This is not used when \code{data} is not a URI.}
\item{ref}{The training dataset that provides quantile information, needed when creating
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
as a reference means that the same quantisation applied to the training data is

View File

@ -46,7 +46,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
@ -105,7 +105,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
{"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
{"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
{"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
{"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3},
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},

View File

@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() {
return mkString(json_str);
}
XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) {
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP uri_char = Rf_protect(Rf_asChar(uri));
const char *uri_ptr = CHAR(uri_char);
R_API_BEGIN();
xgboost::Json jconfig{xgboost::Object{}};
jconfig["uri"] = std::string(uri_ptr);
jconfig["silent"] = Rf_asLogical(silent);
jconfig["data_split_mode"] = Rf_asInteger(data_split_mode);
const std::string sconfig = xgboost::Json::Dump(jconfig);
DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
UNPROTECT(1);
Rf_unprotect(2);
return ret;
}

View File

@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
XGB_DLL SEXP XGBGetGlobalConfig_R();
/*!
* \brief load a data matrix
* \param fname name of the content
* \brief load a data matrix from URI
* \param uri URI to the source file to read data from
* \param silent whether print messages
* \param data_split_mode Data split mode (0=rows, 1=columns)
* \return a loaded data matrix
*/
XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode);
/*!
* \brief create matrix content from dense matrix

View File

@ -692,3 +692,20 @@ test_that("xgb.DMatrix: quantile cuts look correct", {
}
)
})
# Checks that a CSV file can be loaded through the URI reader when the format
# and the label column are given as URI query parameters.
test_that("xgb.DMatrix: can read CSV", {
  txt <- paste(
    "1,2,3",
    "-1,3,2",
    sep = "\n"
  )
  # Unique temporary path so the fixture cannot collide with a 'data.csv'
  # written by another test sharing the same tempdir().
  fname <- tempfile(fileext = ".csv")
  # eos = NULL: by default writeChar() appends a NUL terminator after the
  # string, which would leave a stray "\0" byte at the end of the CSV file.
  writeChar(txt, fname, eos = NULL)
  # The format is passed explicitly through the URI query string;
  # 'label_column=0' marks the first column as the label.
  uri <- paste0(fname, "?format=csv&label_column=0")
  dm <- xgb.DMatrix(uri, silent = TRUE)
  expect_equal(getinfo(dm, "label"), c(1, -1))
  expect_equal(
    as.matrix(xgb.get.DMatrix.data(dm)),
    matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE)
  )
})