[R] switch to URI reader (#10024)
This commit is contained in:
parent
f2095f1d5b
commit
4de866211d
@ -28,10 +28,27 @@
|
||||
#' 'xgb.QuantileDMatrix'.
|
||||
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
|
||||
#' as a single row (only when making predictions from a fitted model).
|
||||
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||
#' supported for xgb.QuantileDMatrix'.
|
||||
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||
#' \bold{not} supported for xgb.QuantileDMatrix'.
|
||||
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
|
||||
#' the file, with an optional format specifier.
|
||||
#'
|
||||
#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
|
||||
#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
|
||||
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
|
||||
#' `?format=libsvm` at the end of the file path. It will be the default format if not
|
||||
#' otherwise specified.
|
||||
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
|
||||
#' `?format=csv` at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
|
||||
#' }
|
||||
#'
|
||||
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
|
||||
#' it will not look at the extension or file contents to determine that it is a comma-separated value.
|
||||
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
|
||||
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
|
||||
#' corresponds to the labels).
|
||||
#'
|
||||
#' For more information about passing text files as input, see the articles
|
||||
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
|
||||
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
|
||||
#' }
|
||||
#' @param label Label of the training data. For classification problems, should be passed encoded as
|
||||
#' integers with numeration starting at zero.
|
||||
@ -81,6 +98,13 @@
|
||||
#' @param label_lower_bound Lower bound for survival training.
|
||||
#' @param label_upper_bound Upper bound for survival training.
|
||||
#' @param feature_weights Set feature weights for column sampling.
|
||||
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
|
||||
#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
|
||||
#'
|
||||
#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
|
||||
#' how the file was split beforehand. Defaults to `"row"`.
|
||||
#'
|
||||
#' This is not used when `data` is not a URI.
|
||||
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||
#' subclass 'xgb.QuantileDMatrix'.
|
||||
#'
|
||||
@ -117,7 +141,8 @@ xgb.DMatrix <- function(
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL
|
||||
feature_weights = NULL,
|
||||
data_split_mode = "row"
|
||||
) {
|
||||
if (!is.null(group) && !is.null(qid)) {
|
||||
stop("Either one of 'group' or 'qid' should be NULL")
|
||||
@ -131,7 +156,14 @@ xgb.DMatrix <- function(
|
||||
)
|
||||
}
|
||||
data <- path.expand(data)
|
||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||
if (data_split_mode == "row") {
|
||||
data_split_mode <- 0L
|
||||
} else if (data_split_mode == "col") {
|
||||
data_split_mode <- 1L
|
||||
} else {
|
||||
stop("Passed invalid 'data_split_mode': ", data_split_mode)
|
||||
}
|
||||
handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
|
||||
} else if (is.matrix(data)) {
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromMat_R, data, missing, nthread
|
||||
|
||||
@ -19,7 +19,8 @@ xgb.DMatrix(
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL
|
||||
feature_weights = NULL,
|
||||
data_split_mode = "row"
|
||||
)
|
||||
|
||||
xgb.QuantileDMatrix(
|
||||
@ -60,10 +61,27 @@ Other column types are not supported.
|
||||
'xgb.QuantileDMatrix'.
|
||||
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
|
||||
as a single row (only when making predictions from a fitted model).
|
||||
\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||
supported for xgb.QuantileDMatrix'.
|
||||
\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||
\bold{not} supported for xgb.QuantileDMatrix'.
|
||||
\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
|
||||
the file, with an optional format specifier.
|
||||
|
||||
These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
|
||||
\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}.
|
||||
\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
|
||||
\code{?format=libsvm} at the end of the file path. It will be the default format if not
|
||||
otherwise specified.
|
||||
\item CSV files (comma-separated values). This format can be specified by adding suffix
|
||||
\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions.
|
||||
}
|
||||
|
||||
Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
|
||||
it will not look at the extension or file contents to determine that it is a comma-separated value.
|
||||
Instead, the format must be specified following the URI format, so the input to \code{data} should be passed
|
||||
like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column
|
||||
corresponds to the labels).
|
||||
|
||||
For more information about passing text files as input, see the articles
|
||||
\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
|
||||
\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
|
||||
}}
|
||||
|
||||
\item{label}{Label of the training data. For classification problems, should be passed encoded as
|
||||
@ -129,6 +147,14 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
|
||||
|
||||
\item{feature_weights}{Set feature weights for column sampling.}
|
||||
|
||||
\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals
|
||||
whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}.
|
||||
|
||||
In distributed mode, the file is split accordingly; otherwise this is only an indicator on
|
||||
how the file was split beforehand. Defaults to \code{"row"}.
|
||||
|
||||
This is not used when \code{data} is not a URI.}
|
||||
|
||||
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||
as a reference means that the same quantisation applied to the training data is
|
||||
|
||||
@ -46,7 +46,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP);
|
||||
extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
|
||||
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
|
||||
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
|
||||
@ -105,7 +105,7 @@ static const R_CallMethodDef CallEntries[] = {
|
||||
{"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2},
|
||||
{"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
|
||||
{"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
|
||||
{"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
|
||||
{"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3},
|
||||
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
|
||||
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
|
||||
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
|
||||
|
||||
@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() {
|
||||
return mkString(json_str);
|
||||
}
|
||||
|
||||
XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
|
||||
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
||||
XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) {
|
||||
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
||||
SEXP uri_char = Rf_protect(Rf_asChar(uri));
|
||||
const char *uri_ptr = CHAR(uri_char);
|
||||
R_API_BEGIN();
|
||||
xgboost::Json jconfig{xgboost::Object{}};
|
||||
jconfig["uri"] = std::string(uri_ptr);
|
||||
jconfig["silent"] = Rf_asLogical(silent);
|
||||
jconfig["data_split_mode"] = Rf_asInteger(data_split_mode);
|
||||
const std::string sconfig = xgboost::Json::Dump(jconfig);
|
||||
DMatrixHandle handle;
|
||||
CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
|
||||
CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle));
|
||||
R_SetExternalPtrAddr(ret, handle);
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
R_API_END();
|
||||
UNPROTECT(1);
|
||||
Rf_unprotect(2);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str);
|
||||
XGB_DLL SEXP XGBGetGlobalConfig_R();
|
||||
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param fname name of the content
|
||||
* \brief load a data matrix from URI
|
||||
* \param uri URI to the source file to read data from
|
||||
* \param silent whether to print messages
|
||||
* \param data_split_mode Data split mode (0=rows, 1=columns)
|
||||
* \return a loaded data matrix
|
||||
*/
|
||||
XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
|
||||
XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode);
|
||||
|
||||
/*!
|
||||
* \brief create matrix content from dense matrix
|
||||
|
||||
@ -692,3 +692,20 @@ test_that("xgb.DMatrix: quantile cuts look correct", {
|
||||
}
|
||||
)
|
||||
})
|
||||
|
||||
test_that("xgb.DMatrix: can read CSV", {
|
||||
txt <- paste(
|
||||
"1,2,3",
|
||||
"-1,3,2",
|
||||
sep = "\n"
|
||||
)
|
||||
fname <- file.path(tempdir(), "data.csv")
|
||||
writeChar(txt, fname)
|
||||
uri <- paste0(fname, "?format=csv&label_column=0")
|
||||
dm <- xgb.DMatrix(uri, silent = TRUE)
|
||||
expect_equal(getinfo(dm, "label"), c(1, -1))
|
||||
expect_equal(
|
||||
as.matrix(xgb.get.DMatrix.data(dm)),
|
||||
matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE)
|
||||
)
|
||||
})
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user