1349 lines
46 KiB
R
1349 lines
46 KiB
R
#' Construct xgb.DMatrix object
|
|
#'
|
|
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
|
|
#' such as [xgb.train()] or [predict()].
|
|
#'
|
|
#' Function `xgb.QuantileDMatrix()` will construct a DMatrix with quantization for the histogram
|
|
#' method already applied to it, which can be used to reduce memory usage (compared to using a
|
|
#' a regular DMatrix first and then creating a quantization out of it) when using the histogram
|
|
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
|
|
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
|
|
#' (`tree_method = "approx"`).
|
|
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
|
|
#' for getting predictions out of a fitted model.
|
|
#'
|
|
#' Supported input types are as follows:\itemize{
|
|
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
|
|
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
|
|
#'
|
|
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
|
|
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
|
|
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
|
|
#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
|
|
#' was constructed.
|
|
#'
|
|
#' Other column types are not supported.
|
|
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
|
|
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for
|
|
#' 'xgb.QuantileDMatrix'.
|
|
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
|
|
#' as a single row (only when making predictions from a fitted model).
|
|
#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
|
|
#' the file, with an optional format specifier.
|
|
#'
|
|
#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
|
|
#' \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
|
|
#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
|
|
#' `?format=libsvm` at the end of the file path. It will be the default format if not
|
|
#' otherwise specified.
|
|
#' \item CSV files (comma-separated values). This format can be specified by adding suffix
|
|
#' `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions.
|
|
#' }
|
|
#'
|
|
#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
|
|
#' it will not look at the extension or file contents to determine that it is a comma-separated value.
|
|
#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
|
|
#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
|
|
#' corresponds to the labels).
|
|
#'
|
|
#' For more information about passing text files as input, see the articles
|
|
#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
|
|
#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
|
|
#' }
|
|
#' @param label Label of the training data. For classification problems, should be passed encoded as
|
|
#' integers with numeration starting at zero.
|
|
#' @param weight Weight for each instance.
|
|
#'
|
|
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
|
|
#' is assigned to each group (not each data point). This is because we
|
|
#' only care about the relative ordering of data points within each group,
|
|
#' so it doesn't make sense to assign weights to individual data points.
|
|
#' @param base_margin Base margin used for boosting from existing model.
|
|
#'
|
|
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
|
#' @param missing A float value to represents missing values in data (not used when creating DMatrix
|
|
#' from text files). It is useful to change when a zero, infinite, or some other
|
|
#' extreme value represents missing values in data.
|
|
#' @param silent whether to suppress printing an informational message after loading from a file.
|
|
#' @param feature_names Set names for features. Overrides column names in data frame and matrix.
|
|
#'
|
|
#' Note: columns are not referenced by name when calling `predict`, so the column order there
|
|
#' must be the same as in the DMatrix construction, regardless of the column names.
|
|
#' @param feature_types Set types for features.
|
|
#'
|
|
#' If `data` is a `data.frame` and passing `feature_types` is not supplied,
|
|
#' feature types will be deduced automatically from the column types.
|
|
#'
|
|
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
|
|
#' with the following possible values:
|
|
#' - "c", which represents categorical columns.
|
|
#' - "q", which represents numeric columns.
|
|
#' - "int", which represents integer columns.
|
|
#' - "i", which represents logical (boolean) columns.
|
|
#'
|
|
#' Note that, while categorical types are treated differently from the rest for model fitting
|
|
#' purposes, the other types do not influence the generated model, but have effects in other
|
|
#' functionalities such as feature importances.
|
|
#'
|
|
#' **Important**: Categorical features, if specified manually through `feature_types`, must
|
|
#' be encoded as integers with numeration starting at zero, and the same encoding needs to be
|
|
#' applied when passing data to [predict()]. Even if passing `factor` types, the encoding will
|
|
#' not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
|
|
#' @param nthread Number of threads used for creating DMatrix.
|
|
#' @param group Group size for all ranking group.
|
|
#' @param qid Query ID for data samples, used for ranking.
|
|
#' @param label_lower_bound Lower bound for survival training.
|
|
#' @param label_upper_bound Upper bound for survival training.
|
|
#' @param feature_weights Set feature weights for column sampling.
|
|
#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
|
|
#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
|
|
#'
|
|
#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
|
|
#' how the file was split beforehand. Default to row.
|
|
#'
|
|
#' This is not used when `data` is not a URI.
|
|
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
|
#' subclass 'xgb.QuantileDMatrix'.
|
|
#'
|
|
#' @details
|
|
#' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()].
|
|
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
|
|
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
|
|
#' from the original source of data.
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' ## Keep the number of threads to 1 for examples
|
|
#' nthread <- 1
|
|
#' data.table::setDTthreads(nthread)
|
|
#' dtrain <- with(
|
|
#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
|
|
#' )
|
|
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
|
|
#' xgb.DMatrix.save(dtrain, fname)
|
|
#' dtrain <- xgb.DMatrix(fname)
|
|
#' @export
|
|
#' @rdname xgb.DMatrix
|
|
xgb.DMatrix <- function(
|
|
data,
|
|
label = NULL,
|
|
weight = NULL,
|
|
base_margin = NULL,
|
|
missing = NA,
|
|
silent = FALSE,
|
|
feature_names = colnames(data),
|
|
feature_types = NULL,
|
|
nthread = NULL,
|
|
group = NULL,
|
|
qid = NULL,
|
|
label_lower_bound = NULL,
|
|
label_upper_bound = NULL,
|
|
feature_weights = NULL,
|
|
data_split_mode = "row"
|
|
) {
|
|
if (!is.null(group) && !is.null(qid)) {
|
|
stop("Either one of 'group' or 'qid' should be NULL")
|
|
}
|
|
nthread <- as.integer(NVL(nthread, -1L))
|
|
if (typeof(data) == "character") {
|
|
if (length(data) > 1) {
|
|
stop(
|
|
"'data' has class 'character' and length ", length(data),
|
|
".\n 'data' accepts either a numeric matrix or a single filename."
|
|
)
|
|
}
|
|
data <- path.expand(data)
|
|
if (data_split_mode == "row") {
|
|
data_split_mode <- 0L
|
|
} else if (data_split_mode == "col") {
|
|
data_split_mode <- 1L
|
|
} else {
|
|
stop("Passed invalid 'data_split_mode': ", data_split_mode)
|
|
}
|
|
handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode)
|
|
} else if (is.matrix(data)) {
|
|
handle <- .Call(
|
|
XGDMatrixCreateFromMat_R, data, missing, nthread
|
|
)
|
|
} else if (inherits(data, "dgCMatrix")) {
|
|
handle <- .Call(
|
|
XGDMatrixCreateFromCSC_R,
|
|
data@p,
|
|
data@i,
|
|
data@x,
|
|
nrow(data),
|
|
missing,
|
|
nthread
|
|
)
|
|
} else if (inherits(data, "dgRMatrix")) {
|
|
handle <- .Call(
|
|
XGDMatrixCreateFromCSR_R,
|
|
data@p,
|
|
data@j,
|
|
data@x,
|
|
ncol(data),
|
|
missing,
|
|
nthread
|
|
)
|
|
} else if (inherits(data, "dsparseVector")) {
|
|
indptr <- c(0L, as.integer(length(data@i)))
|
|
ind <- as.integer(data@i) - 1L
|
|
handle <- .Call(
|
|
XGDMatrixCreateFromCSR_R,
|
|
indptr,
|
|
ind,
|
|
data@x,
|
|
length(data),
|
|
missing,
|
|
nthread
|
|
)
|
|
} else if (is.data.frame(data)) {
|
|
tmp <- .process.df.for.dmatrix(data, feature_types)
|
|
feature_types <- tmp$feature_types
|
|
handle <- .Call(
|
|
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
|
|
)
|
|
rm(tmp)
|
|
} else {
|
|
stop("xgb.DMatrix does not support construction from ", typeof(data))
|
|
}
|
|
|
|
dmat <- handle
|
|
attributes(dmat) <- list(
|
|
class = "xgb.DMatrix",
|
|
fields = new.env()
|
|
)
|
|
.set.dmatrix.fields(
|
|
dmat = dmat,
|
|
label = label,
|
|
weight = weight,
|
|
base_margin = base_margin,
|
|
feature_names = feature_names,
|
|
feature_types = feature_types,
|
|
group = group,
|
|
qid = qid,
|
|
label_lower_bound = label_lower_bound,
|
|
label_upper_bound = label_upper_bound,
|
|
feature_weights = feature_weights
|
|
)
|
|
|
|
return(dmat)
|
|
}
|
|
|
|
.process.df.for.dmatrix <- function(df, feature_types) {
|
|
if (!nrow(df) || !ncol(df)) {
|
|
stop("'data' is an empty data.frame.")
|
|
}
|
|
if (!is.null(feature_types)) {
|
|
if (!is.character(feature_types) || length(feature_types) != ncol(df)) {
|
|
stop(
|
|
"'feature_types' must be a character vector with one entry per column in 'data'."
|
|
)
|
|
}
|
|
} else {
|
|
feature_types <- sapply(df, function(col) {
|
|
if (is.factor(col)) {
|
|
return("c")
|
|
} else if (is.integer(col)) {
|
|
return("int")
|
|
} else if (is.logical(col)) {
|
|
return("i")
|
|
} else {
|
|
if (!is.numeric(col)) {
|
|
stop("Invalid type in dataframe.")
|
|
}
|
|
return("float")
|
|
}
|
|
})
|
|
}
|
|
|
|
lst <- lapply(df, function(col) {
|
|
is_factor <- is.factor(col)
|
|
col <- as.numeric(col)
|
|
if (is_factor) {
|
|
col <- col - 1
|
|
}
|
|
return(col)
|
|
})
|
|
|
|
return(list(lst = lst, feature_types = feature_types))
|
|
}
|
|
|
|
.set.dmatrix.fields <- function(
|
|
dmat,
|
|
label,
|
|
weight,
|
|
base_margin,
|
|
feature_names,
|
|
feature_types,
|
|
group,
|
|
qid,
|
|
label_lower_bound,
|
|
label_upper_bound,
|
|
feature_weights
|
|
) {
|
|
if (!is.null(label)) {
|
|
setinfo(dmat, "label", label)
|
|
}
|
|
if (!is.null(weight)) {
|
|
setinfo(dmat, "weight", weight)
|
|
}
|
|
if (!is.null(base_margin)) {
|
|
setinfo(dmat, "base_margin", base_margin)
|
|
}
|
|
if (!is.null(feature_names)) {
|
|
setinfo(dmat, "feature_name", feature_names)
|
|
}
|
|
if (!is.null(feature_types)) {
|
|
setinfo(dmat, "feature_type", feature_types)
|
|
}
|
|
if (!is.null(group)) {
|
|
setinfo(dmat, "group", group)
|
|
}
|
|
if (!is.null(qid)) {
|
|
setinfo(dmat, "qid", qid)
|
|
}
|
|
if (!is.null(label_lower_bound)) {
|
|
setinfo(dmat, "label_lower_bound", label_lower_bound)
|
|
}
|
|
if (!is.null(label_upper_bound)) {
|
|
setinfo(dmat, "label_upper_bound", label_upper_bound)
|
|
}
|
|
if (!is.null(feature_weights)) {
|
|
setinfo(dmat, "feature_weights", feature_weights)
|
|
}
|
|
}
|
|
|
|
#' @param ref The training dataset that provides quantile information, needed when creating
|
|
#' validation/test dataset with [xgb.QuantileDMatrix()]. Supplying the training DMatrix
|
|
#' as a reference means that the same quantisation applied to the training data is
|
|
#' applied to the validation/test data
|
|
#' @param max_bin The number of histogram bin, should be consistent with the training parameter
|
|
#' `max_bin`.
|
|
#'
|
|
#' This is only supported when constructing a QuantileDMatrix.
|
|
#' @export
|
|
#' @rdname xgb.DMatrix
|
|
xgb.QuantileDMatrix <- function(
|
|
data,
|
|
label = NULL,
|
|
weight = NULL,
|
|
base_margin = NULL,
|
|
missing = NA,
|
|
feature_names = colnames(data),
|
|
feature_types = NULL,
|
|
nthread = NULL,
|
|
group = NULL,
|
|
qid = NULL,
|
|
label_lower_bound = NULL,
|
|
label_upper_bound = NULL,
|
|
feature_weights = NULL,
|
|
ref = NULL,
|
|
max_bin = NULL
|
|
) {
|
|
nthread <- as.integer(NVL(nthread, -1L))
|
|
if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
|
|
stop("'ref' must be an xgb.DMatrix object.")
|
|
}
|
|
|
|
# Note: when passing an integer matrix, it won't get casted to numeric.
|
|
# Since 'int' values as understood by languages like C cannot have missing values,
|
|
# R represents missingness there by assigning them a value equal to the minimum
|
|
# integer. The 'missing' value here is set before the data, so in case of integers,
|
|
# need to make the conversion manually beforehand.
|
|
if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) {
|
|
missing <- .Call(XGGetRNAIntAsDouble)
|
|
}
|
|
|
|
iterator_env <- as.environment(
|
|
list(
|
|
data = data,
|
|
label = label,
|
|
weight = weight,
|
|
base_margin = base_margin,
|
|
missing = missing,
|
|
feature_names = feature_names,
|
|
feature_types = feature_types,
|
|
group = group,
|
|
qid = qid,
|
|
label_lower_bound = label_lower_bound,
|
|
label_upper_bound = label_upper_bound,
|
|
feature_weights = feature_weights
|
|
)
|
|
)
|
|
data_iterator <- .single.data.iterator(iterator_env)
|
|
|
|
# Note: the ProxyDMatrix has its finalizer assigned in the R externalptr
|
|
# object, but that finalizer will only be called once the object is
|
|
# garbage-collected, which doesn't happen immediately after it goes out
|
|
# of scope, hence this piece of code to tigger its destruction earlier
|
|
# and free memory right away.
|
|
proxy_handle <- .make.proxy.handle()
|
|
on.exit({
|
|
.Call(XGDMatrixFree_R, proxy_handle)
|
|
})
|
|
iterator_next <- function() {
|
|
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
|
|
}
|
|
iterator_reset <- function() {
|
|
return(data_iterator$f_reset(iterator_env))
|
|
}
|
|
calling_env <- environment()
|
|
|
|
dmat <- .Call(
|
|
XGQuantileDMatrixCreateFromCallback_R,
|
|
iterator_next,
|
|
iterator_reset,
|
|
calling_env,
|
|
proxy_handle,
|
|
nthread,
|
|
missing,
|
|
max_bin,
|
|
ref
|
|
)
|
|
attributes(dmat) <- list(
|
|
class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
|
|
fields = attributes(proxy_handle)$fields
|
|
)
|
|
return(dmat)
|
|
}
|
|
|
|
#' XGBoost Data Iterator
|
|
#'
|
|
#' @description
|
|
#' Interface to create a custom data iterator in order to construct a DMatrix
|
|
#' from external memory.
|
|
#'
|
|
#' This function is responsible for generating an R object structure containing callback
|
|
#' functions and an environment shared with them.
|
|
#'
|
|
#' The output structure from this function is then meant to be passed to [xgb.ExtMemDMatrix()],
|
|
#' which will consume the data and create a DMatrix from it by executing the callback functions.
|
|
#'
|
|
#' For more information, and for a usage example, see the documentation for [xgb.ExtMemDMatrix()].
|
|
#'
|
|
#' @param env An R environment to pass to the callback functions supplied here, which can be
|
|
#' used to keep track of variables to determine how to handle the batches.
|
|
#'
|
|
#' For example, one might want to keep track of an iteration number in this environment in order
|
|
#' to know which part of the data to pass next.
|
|
#' @param f_next `function(env)` which is responsible for:
|
|
#' - Accessing or retrieving the next batch of data in the iterator.
|
|
#' - Supplying this data by calling function [xgb.DataBatch()] on it and returning the result.
|
|
#' - Keeping track of where in the iterator batch it is or will go next, which can for example
|
|
#' be done by modifiying variables in the `env` variable that is passed here.
|
|
#' - Signaling whether there are more batches to be consumed or not, by returning `NULL`
|
|
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
|
|
#' calling [xgb.DataBatch()] when there are more batches in the line to be consumed.
|
|
#' @param f_reset `function(env)` which is responsible for reseting the data iterator
|
|
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
|
|
#' has been consumed).
|
|
#'
|
|
#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
|
|
#' (and in the same order) must be passed in subsequent iterations.
|
|
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
|
|
#' be passed to [xgb.ExtMemDMatrix()].
|
|
#' @seealso [xgb.ExtMemDMatrix()], [xgb.DataBatch()].
|
|
#' @export
|
|
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
|
|
if (!is.function(f_next)) {
|
|
stop("'f_next' must be a function.")
|
|
}
|
|
if (!is.function(f_reset)) {
|
|
stop("'f_reset' must be a function.")
|
|
}
|
|
out <- list(
|
|
env = env,
|
|
f_next = f_next,
|
|
f_reset = f_reset
|
|
)
|
|
class(out) <- "xgb.DataIter"
|
|
return(out)
|
|
}
|
|
|
|
.qdm.single.fnext <- function(env) {
|
|
curr_iter <- env[["iter"]]
|
|
if (curr_iter >= 1L) {
|
|
return(NULL)
|
|
}
|
|
|
|
on.exit({
|
|
env[["iter"]] <- curr_iter + 1L
|
|
})
|
|
return(
|
|
xgb.DataBatch(
|
|
data = env[["data"]],
|
|
label = env[["label"]],
|
|
weight = env[["weight"]],
|
|
base_margin = env[["base_margin"]],
|
|
feature_names = env[["feature_names"]],
|
|
feature_types = env[["feature_types"]],
|
|
group = env[["group"]],
|
|
qid = env[["qid"]],
|
|
label_lower_bound = env[["label_lower_bound"]],
|
|
label_upper_bound = env[["label_upper_bound"]],
|
|
feature_weights = env[["feature_weights"]]
|
|
)
|
|
)
|
|
}
|
|
|
|
.qdm.single.freset <- function(env) {
|
|
env[["iter"]] <- 0L
|
|
return(invisible(NULL))
|
|
}
|
|
|
|
.single.data.iterator <- function(env) {
|
|
env[["iter"]] <- 0L
|
|
return(xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset))
|
|
}
|
|
|
|
# Only for internal usage
|
|
.make.proxy.handle <- function() {
|
|
out <- .Call(XGProxyDMatrixCreate_R)
|
|
attributes(out) <- list(
|
|
class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
|
|
fields = new.env()
|
|
)
|
|
return(out)
|
|
}
|
|
|
|
#' Structure for Data Batches
|
|
#'
|
|
#' @description
|
|
#' Helper function to supply data in batches of a data iterator when
|
|
#' constructing a DMatrix from external memory through [xgb.ExtMemDMatrix()]
|
|
#' or through [xgb.QuantileDMatrix.from_iterator()].
|
|
#'
|
|
#' This function is **only** meant to be called inside of a callback function (which
|
|
#' is passed as argument to function [xgb.DataIter()] to construct a data iterator)
|
|
#' when constructing a DMatrix through external memory - otherwise, one should call
|
|
#' [xgb.DMatrix()] or [xgb.QuantileDMatrix()].
|
|
#'
|
|
#' The object that results from calling this function directly is **not** like
|
|
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
|
|
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
|
|
#'
|
|
#' For more information and for example usage, see the documentation for [xgb.ExtMemDMatrix()].
|
|
#' @inheritParams xgb.DMatrix
|
|
#' @param data Batch of data belonging to this batch.
|
|
#'
|
|
#' Note that not all of the input types supported by [xgb.DMatrix()] are possible
|
|
#' to pass here. Supported types are:
|
|
#' - `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
|
|
#' `integer` and `logical`, missing values might not be automatically recognized as
|
|
#' as such - see the documentation for parameter `missing` in [xgb.ExtMemDMatrix()]
|
|
#' for details on this.
|
|
#' - `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
|
|
#' conversions applied to it. See the documentation for parameter `data` in
|
|
#' [xgb.DMatrix()] for details on it.
|
|
#' - CSR matrices, as class `dgRMatrix` from package "Matrix".
|
|
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
|
|
#' data and parameters passed here. It does **not** inherit from `xgb.DMatrix`.
|
|
#' @seealso [xgb.DataIter()], [xgb.ExtMemDMatrix()].
|
|
#' @export
|
|
xgb.DataBatch <- function(
|
|
data,
|
|
label = NULL,
|
|
weight = NULL,
|
|
base_margin = NULL,
|
|
feature_names = colnames(data),
|
|
feature_types = NULL,
|
|
group = NULL,
|
|
qid = NULL,
|
|
label_lower_bound = NULL,
|
|
label_upper_bound = NULL,
|
|
feature_weights = NULL
|
|
) {
|
|
stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
|
|
out <- list(
|
|
data = data,
|
|
label = label,
|
|
weight = weight,
|
|
base_margin = base_margin,
|
|
feature_names = feature_names,
|
|
feature_types = feature_types,
|
|
group = group,
|
|
qid = qid,
|
|
label_lower_bound = label_lower_bound,
|
|
label_upper_bound = label_upper_bound,
|
|
feature_weights = feature_weights
|
|
)
|
|
class(out) <- "xgb.DataBatch"
|
|
return(out)
|
|
}
|
|
|
|
# This is only for internal usage, class is not exposed to the user.
|
|
xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
|
|
lst <- data_iterator$f_next(data_iterator$env)
|
|
if (is.null(lst)) {
|
|
return(0L)
|
|
}
|
|
if (!inherits(lst, "xgb.DataBatch")) {
|
|
stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
|
|
}
|
|
|
|
if (!is.null(lst$group) && !is.null(lst$qid)) {
|
|
stop("Either one of 'group' or 'qid' should be NULL")
|
|
}
|
|
if (is.data.frame(lst$data)) {
|
|
tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
|
|
lst$feature_types <- tmp$feature_types
|
|
.Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
|
|
rm(tmp)
|
|
} else if (is.matrix(lst$data)) {
|
|
.Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data)
|
|
} else if (inherits(lst$data, "dgRMatrix")) {
|
|
tmp <- list(p = lst$data@p, j = lst$data@j, x = lst$data@x, ncol = ncol(lst$data))
|
|
.Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, tmp)
|
|
} else {
|
|
stop("'data' has unsupported type.")
|
|
}
|
|
|
|
.set.dmatrix.fields(
|
|
dmat = proxy_handle,
|
|
label = lst$label,
|
|
weight = lst$weight,
|
|
base_margin = lst$base_margin,
|
|
feature_names = lst$feature_names,
|
|
feature_types = lst$feature_types,
|
|
group = lst$group,
|
|
qid = lst$qid,
|
|
label_lower_bound = lst$label_lower_bound,
|
|
label_upper_bound = lst$label_upper_bound,
|
|
feature_weights = lst$feature_weights
|
|
)
|
|
|
|
return(1L)
|
|
}
|
|
|
|
#' DMatrix from External Data
|
|
#'
|
|
#' @description
|
|
#' Create a special type of XGBoost 'DMatrix' object from external data
|
|
#' supplied by an [xgb.DataIter()] object, potentially passed in batches from a
|
|
#' bigger set that might not fit entirely in memory.
|
|
#'
|
|
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
|
|
#' without being concatenated, but note that fields like 'label' **will** be
|
|
#' concatenated from multiple calls to the data iterator.
|
|
#'
|
|
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
|
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
|
#' @inheritParams xgb.DMatrix
|
|
#' @param data_iterator A data iterator structure as returned by [xgb.DataIter()],
|
|
#' which includes an environment shared between function calls, and functions to access
|
|
#' the data in batches on-demand.
|
|
#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
|
|
#' @param missing A float value to represents missing values in data.
|
|
#'
|
|
#' Note that, while functions like [xgb.DMatrix()] can take a generic `NA` and interpret it
|
|
#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
|
|
#' it will not be adapted for different input types.
|
|
#'
|
|
#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
|
|
#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
|
|
#' which is interpreted as a floating-point NaN by [xgb.ExtMemDMatrix()] and by
|
|
#' [xgb.QuantileDMatrix.from_iterator()], these integer missing values will not be treated as missing.
|
|
#' This should not pose any problem for `numeric` types, since they do have an inheret NaN value.
|
|
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
|
|
#' held internally but accessed through the iterator when needed.
|
|
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.QuantileDMatrix.from_iterator()]
|
|
#' @examples
|
|
#' data(mtcars)
|
|
#'
|
|
#' # This custom environment will be passed to the iterator
|
|
#' # functions at each call. It is up to the user to keep
|
|
#' # track of the iteration number in this environment.
|
|
#' iterator_env <- as.environment(
|
|
#' list(
|
|
#' iter = 0,
|
|
#' x = mtcars[, -1],
|
|
#' y = mtcars[, 1]
|
|
#' )
|
|
#' )
|
|
#'
|
|
#' # Data is passed in two batches.
|
|
#' # In this example, batches are obtained by subsetting the 'x' variable.
|
|
#' # This is not advantageous to do, since the data is already loaded in memory
|
|
#' # and can be passed in full in one go, but there can be situations in which
|
|
#' # only a subset of the data will fit in the computer's memory, and it can
|
|
#' # be loaded in batches that are accessed one-at-a-time only.
|
|
#' iterator_next <- function(iterator_env) {
|
|
#' curr_iter <- iterator_env[["iter"]]
|
|
#' if (curr_iter >= 2) {
|
|
#' # there are only two batches, so this signals end of the stream
|
|
#' return(NULL)
|
|
#' }
|
|
#'
|
|
#' if (curr_iter == 0) {
|
|
#' x_batch <- iterator_env[["x"]][1:16, ]
|
|
#' y_batch <- iterator_env[["y"]][1:16]
|
|
#' } else {
|
|
#' x_batch <- iterator_env[["x"]][17:32, ]
|
|
#' y_batch <- iterator_env[["y"]][17:32]
|
|
#' }
|
|
#' on.exit({
|
|
#' iterator_env[["iter"]] <- curr_iter + 1
|
|
#' })
|
|
#'
|
|
#' # Function 'xgb.DataBatch' must be called manually
|
|
#' # at each batch with all the appropriate attributes,
|
|
#' # such as feature names and feature types.
|
|
#' return(xgb.DataBatch(data = x_batch, label = y_batch))
|
|
#' }
|
|
#'
|
|
#' # This moves the iterator back to its beginning
|
|
#' iterator_reset <- function(iterator_env) {
|
|
#' iterator_env[["iter"]] <- 0
|
|
#' }
|
|
#'
|
|
#' data_iterator <- xgb.DataIter(
|
|
#' env = iterator_env,
|
|
#' f_next = iterator_next,
|
|
#' f_reset = iterator_reset
|
|
#' )
|
|
#' cache_prefix <- tempdir()
|
|
#'
|
|
#' # DMatrix will be constructed from the iterator's batches
|
|
#' dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
|
|
#'
|
|
#' # After construction, can be used as a regular DMatrix
|
|
#' params <- list(nthread = 1, objective = "reg:squarederror")
|
|
#' model <- xgb.train(data = dm, nrounds = 2, params = params)
|
|
#'
|
|
#' # Predictions can also be called on it, and should be the same
|
|
#' # as if the data were passed differently.
|
|
#' pred_dm <- predict(model, dm)
|
|
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
|
|
#' @export
|
|
xgb.ExtMemDMatrix <- function(
|
|
data_iterator,
|
|
cache_prefix = tempdir(),
|
|
missing = NA,
|
|
nthread = NULL
|
|
) {
|
|
stopifnot(inherits(data_iterator, "xgb.DataIter"))
|
|
stopifnot(is.character(cache_prefix))
|
|
|
|
cache_prefix <- path.expand(cache_prefix)
|
|
nthread <- as.integer(NVL(nthread, -1L))
|
|
|
|
proxy_handle <- .make.proxy.handle()
|
|
on.exit({
|
|
.Call(XGDMatrixFree_R, proxy_handle)
|
|
})
|
|
iterator_next <- function() {
|
|
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
|
|
}
|
|
iterator_reset <- function() {
|
|
return(data_iterator$f_reset(data_iterator$env))
|
|
}
|
|
calling_env <- environment()
|
|
|
|
dmat <- .Call(
|
|
XGDMatrixCreateFromCallback_R,
|
|
iterator_next,
|
|
iterator_reset,
|
|
calling_env,
|
|
proxy_handle,
|
|
nthread,
|
|
missing,
|
|
cache_prefix
|
|
)
|
|
|
|
attributes(dmat) <- list(
|
|
class = c("xgb.DMatrix", "xgb.ExtMemDMatrix"),
|
|
fields = attributes(proxy_handle)$fields
|
|
)
|
|
return(dmat)
|
|
}
|
|
|
|
|
|
#' QuantileDMatrix from External Data
|
|
#'
|
|
#' @description
|
|
#' Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
|
|
#' calling function [xgb.QuantileDMatrix()], with the same advantages and limitations) from
|
|
#' external data supplied by [xgb.DataIter()], potentially passed in batches from
|
|
#' a bigger set that might not fit entirely in memory, same way as [xgb.ExtMemDMatrix()].
|
|
#'
|
|
#' Note that, while external data will only be loaded through the iterator (thus the full data
|
|
#' might not be held entirely in-memory), the quantized representation of the data will get
|
|
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
|
|
#' version is typically lighter than the original data, so there might be cases in which this
|
|
#' representation could potentially fit in memory even if the full data does not.
|
|
#'
|
|
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
|
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
|
#' @inheritParams xgb.ExtMemDMatrix
|
|
#' @inheritParams xgb.QuantileDMatrix
|
|
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
|
|
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.ExtMemDMatrix()],
|
|
#' [xgb.QuantileDMatrix()]
|
|
#' @export
|
|
xgb.QuantileDMatrix.from_iterator <- function( # nolint
|
|
data_iterator,
|
|
missing = NA,
|
|
nthread = NULL,
|
|
ref = NULL,
|
|
max_bin = NULL
|
|
) {
|
|
stopifnot(inherits(data_iterator, "xgb.DataIter"))
|
|
if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
|
|
stop("'ref' must be an xgb.DMatrix object.")
|
|
}
|
|
|
|
nthread <- as.integer(NVL(nthread, -1L))
|
|
|
|
proxy_handle <- .make.proxy.handle()
|
|
on.exit({
|
|
.Call(XGDMatrixFree_R, proxy_handle)
|
|
})
|
|
iterator_next <- function() {
|
|
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
|
|
}
|
|
iterator_reset <- function() {
|
|
return(data_iterator$f_reset(data_iterator$env))
|
|
}
|
|
calling_env <- environment()
|
|
|
|
dmat <- .Call(
|
|
XGQuantileDMatrixCreateFromCallback_R,
|
|
iterator_next,
|
|
iterator_reset,
|
|
calling_env,
|
|
proxy_handle,
|
|
nthread,
|
|
missing,
|
|
max_bin,
|
|
ref
|
|
)
|
|
|
|
attributes(dmat) <- list(
|
|
class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
|
|
fields = attributes(proxy_handle)$fields
|
|
)
|
|
return(dmat)
|
|
}
|
|
|
|
#' Check whether DMatrix object has a field
|
|
#'
|
|
#' Checks whether an xgb.DMatrix object has a given field assigned to
|
|
#' it, such as weights, labels, etc.
|
|
#' @param object The DMatrix object to check for the given `info` field.
|
|
#' @param info The field to check for presence or absence in `object`.
|
|
#' @seealso [xgb.DMatrix()], [getinfo.xgb.DMatrix()], [setinfo.xgb.DMatrix()]
|
|
#' @examples
|
|
#' x <- matrix(1:10, nrow = 5)
|
|
#' dm <- xgb.DMatrix(x, nthread = 1)
|
|
#'
|
|
#' # 'dm' so far does not have any fields set
|
|
#' xgb.DMatrix.hasinfo(dm, "label")
|
|
#'
|
|
#' # Fields can be added after construction
|
|
#' setinfo(dm, "label", 1:5)
|
|
#' xgb.DMatrix.hasinfo(dm, "label")
|
|
#' @export
|
|
xgb.DMatrix.hasinfo <- function(object, info) {
|
|
if (!inherits(object, "xgb.DMatrix")) {
|
|
stop("Object is not an 'xgb.DMatrix'.")
|
|
}
|
|
if (.Call(XGCheckNullPtr_R, object)) {
|
|
warning("xgb.DMatrix object is invalid. Must be constructed again.")
|
|
return(FALSE)
|
|
}
|
|
return(NVL(attr(object, "fields")[[info]], FALSE))
|
|
}
|
|
|
|
|
|
#' Dimensions of xgb.DMatrix
|
|
#'
|
|
#' Returns a vector of numbers of rows and of columns in an `xgb.DMatrix`.
|
|
#'
|
|
#' @param x Object of class `xgb.DMatrix`
|
|
#'
|
|
#' @details
|
|
#' Note: since [nrow()] and [ncol()] internally use [dim()], they can also
|
|
#' be directly used with an `xgb.DMatrix` object.
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' train <- agaricus.train
|
|
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
|
|
#'
|
|
#' stopifnot(nrow(dtrain) == nrow(train$data))
|
|
#' stopifnot(ncol(dtrain) == ncol(train$data))
|
|
#' stopifnot(all(dim(dtrain) == dim(train$data)))
|
|
#'
|
|
#' @export
|
|
dim.xgb.DMatrix <- function(x) {
|
|
c(.Call(XGDMatrixNumRow_R, x), .Call(XGDMatrixNumCol_R, x))
|
|
}
|
|
|
|
|
|
#' Handling of column names of `xgb.DMatrix`
|
|
#'
|
|
#' Only column names are supported for `xgb.DMatrix`, thus setting of
|
|
#' row names would have no effect and returned row names would be `NULL`.
|
|
#'
|
|
#' @param x Object of class `xgb.DMatrix`.
|
|
#' @param value A list of two elements: the first one is ignored
|
|
#' and the second one is column names
|
|
#'
|
|
#' @details
|
|
#' Generic [dimnames()] methods are used by [colnames()].
|
|
#' Since row names are irrelevant, it is recommended to use [colnames()] directly.
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' train <- agaricus.train
|
|
#' dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2)
|
|
#' dimnames(dtrain)
|
|
#' colnames(dtrain)
|
|
#' colnames(dtrain) <- make.names(1:ncol(train$data))
|
|
#' print(dtrain, verbose = TRUE)
|
|
#'
|
|
#' @rdname dimnames.xgb.DMatrix
|
|
#' @export
|
|
dimnames.xgb.DMatrix <- function(x) {
|
|
fn <- getinfo(x, "feature_name")
|
|
## row names is null.
|
|
list(NULL, fn)
|
|
}
|
|
|
|
#' @rdname dimnames.xgb.DMatrix
|
|
#' @export
|
|
`dimnames<-.xgb.DMatrix` <- function(x, value) {
|
|
if (!is.list(value) || length(value) != 2L)
|
|
stop("invalid 'dimnames' given: must be a list of two elements")
|
|
if (!is.null(value[[1L]]))
|
|
stop("xgb.DMatrix does not have rownames")
|
|
if (is.null(value[[2]])) {
|
|
setinfo(x, "feature_name", NULL)
|
|
return(x)
|
|
}
|
|
if (ncol(x) != length(value[[2]])) {
|
|
stop("can't assign ", length(value[[2]]), " colnames to a ", ncol(x), " column xgb.DMatrix")
|
|
}
|
|
setinfo(x, "feature_name", value[[2]])
|
|
x
|
|
}
|
|
|
|
|
|
#' Get or set information of xgb.DMatrix and xgb.Booster objects
|
|
#'
|
|
#' @param object Object of class `xgb.DMatrix` or `xgb.Booster`.
|
|
#' @param name The name of the information field to get (see details).
|
|
#' @return For `getinfo()`, will return the requested field. For `setinfo()`,
|
|
#' will always return value `TRUE` if it succeeds.
|
|
#' @details
|
|
#' The `name` field can be one of the following for `xgb.DMatrix`:
|
|
#' - label
|
|
#' - weight
|
|
#' - base_margin
|
|
#' - label_lower_bound
|
|
#' - label_upper_bound
|
|
#' - group
|
|
#' - feature_type
|
|
#' - feature_name
|
|
#' - nrow
|
|
#'
|
|
#' See the documentation for [xgb.DMatrix()] for more information about these fields.
|
|
#'
|
|
#' For `xgb.Booster`, can be one of the following:
|
|
#' - `feature_type`
|
|
#' - `feature_name`
|
|
#'
|
|
#' Note that, while 'qid' cannot be retrieved, it is possible to get the equivalent 'group'
|
|
#' for a DMatrix that had 'qid' assigned.
|
|
#'
|
|
#' **Important**: when calling [setinfo()], the objects are modified in-place. See
|
|
#' [xgb.copy.Booster()] for an idea of this in-place assignment works.
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
|
#'
|
|
#' labels <- getinfo(dtrain, "label")
|
|
#' setinfo(dtrain, "label", 1 - labels)
|
|
#'
|
|
#' labels2 <- getinfo(dtrain, "label")
|
|
#' stopifnot(all(labels2 == 1 - labels))
|
|
#' @rdname getinfo
|
|
#' @export
|
|
getinfo <- function(object, name) UseMethod("getinfo")
|
|
|
|
#' @rdname getinfo
|
|
#' @export
|
|
getinfo.xgb.DMatrix <- function(object, name) {
|
|
allowed_int_fields <- 'group'
|
|
allowed_float_fields <- c(
|
|
'label', 'weight', 'base_margin',
|
|
'label_lower_bound', 'label_upper_bound'
|
|
)
|
|
allowed_str_fields <- c("feature_type", "feature_name")
|
|
allowed_fields <- c(allowed_float_fields, allowed_int_fields, allowed_str_fields, 'nrow')
|
|
|
|
if (typeof(name) != "character" ||
|
|
length(name) != 1 ||
|
|
!name %in% allowed_fields) {
|
|
stop("getinfo: name must be one of the following\n",
|
|
paste(paste0("'", allowed_fields, "'"), collapse = ", "))
|
|
}
|
|
if (name == "nrow") {
|
|
ret <- nrow(object)
|
|
} else if (name %in% allowed_str_fields) {
|
|
ret <- .Call(XGDMatrixGetStrFeatureInfo_R, object, name)
|
|
} else if (name %in% allowed_float_fields) {
|
|
ret <- .Call(XGDMatrixGetFloatInfo_R, object, name)
|
|
if (length(ret) > nrow(object)) {
|
|
ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
|
|
}
|
|
} else if (name %in% allowed_int_fields) {
|
|
if (name == "group") {
|
|
name <- "group_ptr"
|
|
}
|
|
ret <- .Call(XGDMatrixGetUIntInfo_R, object, name)
|
|
if (length(ret) > nrow(object)) {
|
|
ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
|
|
}
|
|
}
|
|
if (length(ret) == 0) return(NULL)
|
|
return(ret)
|
|
}
|
|
|
|
#' @rdname getinfo
|
|
#' @param info The specific field of information to set.
|
|
#'
|
|
#' @details
|
|
#' See the documentation for [xgb.DMatrix()] for possible fields that can be set
|
|
#' (which correspond to arguments in that function).
|
|
#'
|
|
#' Note that the following fields are allowed in the construction of an `xgb.DMatrix`
|
|
#' but **are not** allowed here:
|
|
#' - data
|
|
#' - missing
|
|
#' - silent
|
|
#' - nthread
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
|
#'
|
|
#' labels <- getinfo(dtrain, "label")
|
|
#' setinfo(dtrain, "label", 1 - labels)
|
|
#'
|
|
#' labels2 <- getinfo(dtrain, "label")
|
|
#' stopifnot(all.equal(labels2, 1 - labels))
|
|
#' @export
|
|
setinfo <- function(object, name, info) UseMethod("setinfo")
|
|
|
|
#' @rdname getinfo
|
|
#' @export
|
|
setinfo.xgb.DMatrix <- function(object, name, info) {
|
|
.internal.setinfo.xgb.DMatrix(object, name, info)
|
|
attr(object, "fields")[[name]] <- TRUE
|
|
return(TRUE)
|
|
}
|
|
|
|
.internal.setinfo.xgb.DMatrix <- function(object, name, info) {
|
|
if (name == "label") {
|
|
if (NROW(info) != nrow(object))
|
|
stop("The length of labels must equal to the number of rows in the input data")
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "label_lower_bound") {
|
|
if (NROW(info) != nrow(object))
|
|
stop("The length of lower-bound labels must equal to the number of rows in the input data")
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "label_upper_bound") {
|
|
if (NROW(info) != nrow(object))
|
|
stop("The length of upper-bound labels must equal to the number of rows in the input data")
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "weight") {
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "base_margin") {
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "group") {
|
|
if (sum(info) != nrow(object))
|
|
stop("The sum of groups must equal to the number of rows in the input data")
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "qid") {
|
|
if (NROW(info) != nrow(object))
|
|
stop("The length of qid assignments must equal to the number of rows in the input data")
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
if (name == "feature_weights") {
|
|
if (NROW(info) != ncol(object)) {
|
|
stop("The number of feature weights must equal to the number of columns in the input data")
|
|
}
|
|
.Call(XGDMatrixSetInfo_R, object, name, info)
|
|
return(TRUE)
|
|
}
|
|
|
|
set_feat_info <- function(name) {
|
|
msg <- sprintf(
|
|
"The number of %s must equal to the number of columns in the input data. %s vs. %s",
|
|
name,
|
|
length(info),
|
|
ncol(object)
|
|
)
|
|
if (!is.null(info)) {
|
|
info <- as.list(info)
|
|
if (length(info) != ncol(object)) {
|
|
stop(msg)
|
|
}
|
|
}
|
|
.Call(XGDMatrixSetStrFeatureInfo_R, object, name, info)
|
|
}
|
|
if (name == "feature_name") {
|
|
set_feat_info("feature_name")
|
|
return(TRUE)
|
|
}
|
|
if (name == "feature_type") {
|
|
set_feat_info("feature_type")
|
|
return(TRUE)
|
|
}
|
|
stop("setinfo: unknown info name ", name)
|
|
}
|
|
|
|
#' Get Quantile Cuts from DMatrix
|
|
#'
|
|
#' @description
|
|
#' Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
|
|
#' that has been quantized for the histogram method (`tree_method = "hist"`).
|
|
#'
|
|
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
|
|
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
|
|
#' As such, the first and last bin will be outside of the range of the data, so as to include
|
|
#' all of the observations there.
|
|
#'
|
|
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
|
|
#' which will be output in sorted order from lowest to highest.
|
|
#'
|
|
#' Different columns can have different numbers of bins according to their range.
|
|
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
|
|
#' @param output Output format for the quantile cuts. Possible options are:
|
|
#' - "list"` will return the output as a list with one entry per column, where
|
|
#' each column will have a numeric vector with the cuts. The list will be named if
|
|
#' `dmat` has column names assigned to it.
|
|
#' - `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
|
|
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
|
|
#' ` indptr[i]+1` to `indptr[i+1]`.
|
|
#' @return The quantile cuts, in the format specified by parameter `output`.
|
|
#' @examples
|
|
#' data(mtcars)
|
|
#'
|
|
#' y <- mtcars$mpg
|
|
#' x <- as.matrix(mtcars[, -1])
|
|
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
|
#'
|
|
#' # DMatrix is not quantized right away, but will be once a hist model is generated
|
|
#' model <- xgb.train(
|
|
#' data = dm,
|
|
#' params = list(tree_method = "hist", max_bin = 8, nthread = 1),
|
|
#' nrounds = 3
|
|
#' )
|
|
#'
|
|
#' # Now can get the quantile cuts
|
|
#' xgb.get.DMatrix.qcut(dm)
|
|
#' @export
|
|
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
|
|
stopifnot(inherits(dmat, "xgb.DMatrix"))
|
|
output <- head(output, 1L)
|
|
stopifnot(output %in% c("list", "arrays"))
|
|
res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
|
|
if (output == "arrays") {
|
|
return(res)
|
|
} else {
|
|
feature_names <- getinfo(dmat, "feature_name")
|
|
ncols <- length(res$indptr) - 1
|
|
out <- lapply(
|
|
seq(1, ncols),
|
|
function(col) {
|
|
st <- res$indptr[col]
|
|
end <- res$indptr[col + 1]
|
|
if (end <= st) {
|
|
return(numeric())
|
|
}
|
|
return(res$data[seq(1 + st, end)])
|
|
}
|
|
)
|
|
if (NROW(feature_names)) {
|
|
names(out) <- feature_names
|
|
}
|
|
return(out)
|
|
}
|
|
}
|
|
|
|
#' Get Number of Non-Missing Entries in DMatrix
|
|
#'
|
|
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
|
|
#' @return The number of non-missing entries in the DMatrix.
|
|
#' @export
|
|
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
|
|
stopifnot(inherits(dmat, "xgb.DMatrix"))
|
|
return(.Call(XGDMatrixNumNonMissing_R, dmat))
|
|
}
|
|
|
|
#' Get DMatrix Data
|
|
#'
|
|
#' @param dmat An `xgb.DMatrix` object, as returned by [xgb.DMatrix()].
|
|
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
|
|
#' from package `Matrix`). If it had feature names, these will be added as column names
|
|
#' in the output.
|
|
#' @export
|
|
xgb.get.DMatrix.data <- function(dmat) {
|
|
stopifnot(inherits(dmat, "xgb.DMatrix"))
|
|
res <- .Call(XGDMatrixGetDataAsCSR_R, dmat)
|
|
out <- methods::new("dgRMatrix")
|
|
nrows <- as.integer(length(res$indptr) - 1)
|
|
out@p <- res$indptr
|
|
out@j <- res$indices
|
|
out@x <- res$data
|
|
out@Dim <- as.integer(c(nrows, res$ncols))
|
|
|
|
feature_names <- getinfo(dmat, "feature_name")
|
|
dim_names <- list(NULL, NULL)
|
|
if (NROW(feature_names)) {
|
|
dim_names[[2L]] <- feature_names
|
|
}
|
|
out@Dimnames <- dim_names
|
|
return(out)
|
|
}
|
|
|
|
#' Slice DMatrix
|
|
#'
|
|
#' Get a new DMatrix containing the specified rows of original xgb.DMatrix object.
|
|
#'
|
|
#' @param object Object of class `xgb.DMatrix`.
|
|
#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
|
|
#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
|
|
#' equivalently `qid`) field. Note that in such case, the result will not have
|
|
#' the groups anymore - they need to be set manually through [setinfo()].
|
|
#' @param colset Currently not used (columns subsetting is not available).
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
|
#'
|
|
#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
|
|
#' labels1 <- getinfo(dsub, "label")
|
|
#'
|
|
#' dsub <- dtrain[1:42, ]
|
|
#' labels2 <- getinfo(dsub, "label")
|
|
#' all.equal(labels1, labels2)
|
|
#'
|
|
#' @rdname xgb.slice.DMatrix
|
|
#' @export
|
|
xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
|
|
if (!inherits(object, "xgb.DMatrix")) {
|
|
stop("object must be xgb.DMatrix")
|
|
}
|
|
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups)
|
|
|
|
attr_list <- attributes(object)
|
|
nr <- nrow(object)
|
|
len <- sapply(attr_list, NROW)
|
|
ind <- which(len == nr)
|
|
if (length(ind) > 0) {
|
|
nms <- names(attr_list)[ind]
|
|
for (i in seq_along(ind)) {
|
|
obj_attr <- attr(object, nms[i])
|
|
if (NCOL(obj_attr) > 1) {
|
|
attr(ret, nms[i]) <- obj_attr[idxset, ]
|
|
} else {
|
|
attr(ret, nms[i]) <- obj_attr[idxset]
|
|
}
|
|
}
|
|
}
|
|
|
|
out <- structure(ret, class = "xgb.DMatrix")
|
|
parent_fields <- as.list(attributes(object)$fields)
|
|
if (NROW(parent_fields)) {
|
|
child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))]
|
|
child_fields <- as.environment(child_fields)
|
|
attributes(out)$fields <- child_fields
|
|
}
|
|
return(out)
|
|
}
|
|
|
|
#' @rdname xgb.slice.DMatrix
|
|
#' @export
|
|
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
|
|
xgb.slice.DMatrix(object, idxset)
|
|
}
|
|
|
|
|
|
#' Print xgb.DMatrix
|
|
#'
|
|
#' Print information about xgb.DMatrix.
|
|
#' Currently it displays dimensions and presence of info-fields and colnames.
|
|
#'
|
|
#' @param x An xgb.DMatrix object.
|
|
#' @param verbose Whether to print colnames (when present).
|
|
#' @param ... Not currently used.
|
|
#'
|
|
#' @examples
|
|
#' data(agaricus.train, package = "xgboost")
|
|
#'
|
|
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
|
#' dtrain
|
|
#'
|
|
#' print(dtrain, verbose = TRUE)
|
|
#'
|
|
#' @method print xgb.DMatrix
|
|
#' @export
|
|
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
|
|
if (.Call(XGCheckNullPtr_R, x)) {
|
|
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
|
|
return(invisible(x))
|
|
}
|
|
class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
|
|
"xgb.QuantileDMatrix"
|
|
} else if (inherits(x, "xgb.ExtMemDMatrix")) {
|
|
"xgb.ExtMemDMatrix"
|
|
} else if (inherits(x, "xgb.ProxyDMatrix")) {
|
|
"xgb.ProxyDMatrix"
|
|
} else {
|
|
"xgb.DMatrix"
|
|
}
|
|
|
|
cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
|
|
infos <- names(attributes(x)$fields)
|
|
infos <- infos[infos != "feature_name"]
|
|
if (!NROW(infos)) infos <- "NA"
|
|
infos <- infos[order(infos)]
|
|
infos <- paste(infos, collapse = ", ")
|
|
cat(infos)
|
|
cnames <- colnames(x)
|
|
cat(' colnames:')
|
|
if (verbose && !is.null(cnames)) {
|
|
cat("\n'")
|
|
cat(cnames, sep = "','")
|
|
cat("'")
|
|
} else {
|
|
if (is.null(cnames)) cat(' no')
|
|
else cat(' yes')
|
|
}
|
|
cat("\n")
|
|
invisible(x)
|
|
}
|