From 4de866211d5bba706f6b94d1ba4a102fe885c1b9 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Sun, 4 Feb 2024 22:03:38 +0100 Subject: [PATCH] [R] switch to URI reader (#10024) --- R-package/R/xgb.DMatrix.R | 44 +++++++++++++++++++++---- R-package/man/xgb.DMatrix.Rd | 36 +++++++++++++++++--- R-package/src/init.c | 4 +-- R-package/src/xgboost_R.cc | 15 ++++++--- R-package/src/xgboost_R.h | 7 ++-- R-package/tests/testthat/test_dmatrix.R | 17 ++++++++++ 6 files changed, 103 insertions(+), 20 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index ba0686cf9..edbc267c1 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -28,10 +28,27 @@ #' 'xgb.QuantileDMatrix'. #' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted #' as a single row (only when making predictions from a fitted model). -#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} -#' supported for xgb.QuantileDMatrix'. -#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are -#' \bold{not} supported for xgb.QuantileDMatrix'. +#' \item Text files in a supported format, passed as a `character` variable containing the URI path to +#' the file, with an optional format specifier. +#' +#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{ +#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}. +#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix +#' `?format=libsvm` at the end of the file path. It will be the default format if not +#' otherwise specified. +#' \item CSV files (comma-separated values). This format can be specified by adding suffix +#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions. 
+#' } +#' +#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', +#' it will not look at the extension or file contents to determine that it is a comma-separated value. +#' Instead, the format must be specified following the URI format, so the input to `data` should be passed +#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column +#' corresponds to the labels). +#' +#' For more information about passing text files as input, see the articles +#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and +#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. #' } #' @param label Label of the training data. For classification problems, should be passed encoded as #' integers with numeration starting at zero. @@ -81,6 +98,13 @@ #' @param label_lower_bound Lower bound for survival training. #' @param label_upper_bound Upper bound for survival training. #' @param feature_weights Set feature weights for column sampling. +#' @param data_split_mode When passing a URI (as R `character`) as input, this signals +#' whether to split by row or column. Allowed values are `"row"` and `"col"`. +#' +#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on +#' how the file was split beforehand. Default to row. +#' +#' This is not used when `data` is not a URI. #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional #' subclass 'xgb.QuantileDMatrix'. 
#' @@ -117,7 +141,8 @@ xgb.DMatrix <- function( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + data_split_mode = "row" ) { if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") @@ -131,7 +156,14 @@ xgb.DMatrix <- function( ) } data <- path.expand(data) - handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) + if (data_split_mode == "row") { + data_split_mode <- 0L + } else if (data_split_mode == "col") { + data_split_mode <- 1L + } else { + stop("Passed invalid 'data_split_mode': ", data_split_mode) + } + handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode) } else if (is.matrix(data)) { handle <- .Call( XGDMatrixCreateFromMat_R, data, missing, nthread diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index d18270733..5f764ed45 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -19,7 +19,8 @@ xgb.DMatrix( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + data_split_mode = "row" ) xgb.QuantileDMatrix( @@ -60,10 +61,27 @@ Other column types are not supported. 'xgb.QuantileDMatrix'. \item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted as a single row (only when making predictions from a fitted model). -\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} -supported for xgb.QuantileDMatrix'. -\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are -\bold{not} supported for xgb.QuantileDMatrix'. +\item Text files in a supported format, passed as a \code{character} variable containing the URI path to +the file, with an optional format specifier. + +These are \bold{not} supported for \code{xgb.QuantileDMatrix}. 
Supported formats are:\itemize{ +\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}. +\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix +\code{?format=libsvm} at the end of the file path. It will be the default format if not +otherwise specified. +\item CSV files (comma-separated values). This format can be specified by adding suffix +\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions. +} + +Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', +it will not look at the extension or file contents to determine that it is a comma-separated value. +Instead, the format must be specified following the URI format, so the input to \code{data} should be passed +like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column +corresponds to the labels). + +For more information about passing text files as input, see the articles +\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and +\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. }} \item{label}{Label of the training data. For classification problems, should be passed encoded as @@ -129,6 +147,14 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h \item{feature_weights}{Set feature weights for column sampling.} +\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals +whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}. + +In distributed mode, the file is split accordingly; otherwise this is only an indicator on +how the file was split beforehand. Default to row. 
+ +This is not used when \code{data} is not a URI.} + \item{ref}{The training dataset that provides quantile information, needed when creating validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix as a reference means that the same quantisation applied to the training data is diff --git a/R-package/src/init.c b/R-package/src/init.c index a9f3f3e38..36f3e8953 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -46,7 +46,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP); extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP); +extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP); extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP); @@ -105,7 +105,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2}, {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6}, {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6}, - {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2}, + {"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3}, {"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3}, {"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2}, {"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index c91fb94c4..4192f82fb 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() { return mkString(json_str); } -XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { - SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, 
R_NilValue, R_NilValue)); +XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) { + SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + SEXP uri_char = Rf_protect(Rf_asChar(uri)); + const char *uri_ptr = CHAR(uri_char); R_API_BEGIN(); + xgboost::Json jconfig{xgboost::Object{}}; + jconfig["uri"] = std::string(uri_ptr); + jconfig["silent"] = Rf_asLogical(silent); + jconfig["data_split_mode"] = Rf_asInteger(data_split_mode); + const std::string sconfig = xgboost::Json::Dump(jconfig); DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle)); + CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle)); R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); - UNPROTECT(1); + Rf_unprotect(2); return ret; } diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index d2e0ae828..652345e52 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str); XGB_DLL SEXP XGBGetGlobalConfig_R(); /*! - * \brief load a data matrix - * \param fname name of the content + * \brief load a data matrix from URI + * \param uri URI to the source file to read data from * \param silent whether print messages + * \param data_split_mode Data split mode (0=rows, 1=columns) * \return a loaded data matrix */ -XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent); +XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode); /*! 
* \brief create matrix content from dense matrix diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 50621f241..45bcac08d 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -692,3 +692,20 @@ test_that("xgb.DMatrix: quantile cuts look correct", { } ) }) + +test_that("xgb.DMatrix: can read CSV", { + txt <- paste( + "1,2,3", + "-1,3,2", + sep = "\n" + ) + fname <- file.path(tempdir(), "data.csv") + writeChar(txt, fname) + uri <- paste0(fname, "?format=csv&label_column=0") + dm <- xgb.DMatrix(uri, silent = TRUE) + expect_equal(getinfo(dm, "label"), c(1, -1)) + expect_equal( + as.matrix(xgb.get.DMatrix.data(dm)), + matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE) + ) +})