xgboost/R-package/R/xgb.DMatrix.R
david-cortes 662854c7d7
[R] Document handling of indexes (#10019)
---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
2024-02-02 05:39:09 +08:00

1330 lines
45 KiB
R

#' Construct xgb.DMatrix object
#'
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
#' such as \link{xgb.train} or \link{predict.xgb.Booster}.
#'
#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
#' method already applied to it, which can be used to reduce memory usage (compared to using
#' a regular DMatrix first and then creating a quantization out of it) when using the histogram
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
#' (`tree_method = "approx"`).
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
#' for getting predictions out of a fitted model.
#'
#' Supported input types are as follows:\itemize{
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
#' was constructed.
#'
#' Other column types are not supported.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for
#' 'xgb.QuantileDMatrix'.
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
#' as a single row (only when making predictions from a fitted model).
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
#' supported for 'xgb.QuantileDMatrix'.
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
#' \bold{not} supported for 'xgb.QuantileDMatrix'.
#' }
#' @param label Label of the training data. For classification problems, should be passed encoded as
#' integers with numeration starting at zero.
#' @param weight Weight for each instance.
#'
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
#' is assigned to each group (not each data point). This is because we
#' only care about the relative ordering of data points within each group,
#' so it doesn't make sense to assign weights to individual data points.
#' @param base_margin Base margin used for boosting from existing model.
#'
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
#' @param missing A float value that represents missing values in data (not used when creating DMatrix
#' from text files).
#' It is useful to change when a zero, infinite, or some other extreme value represents missing
#' values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features. Overrides column names in data
#' frame and matrix.
#'
#' Note: columns are not referenced by name when calling `predict`, so the column order there
#' must be the same as in the DMatrix construction, regardless of the column names.
#' @param feature_types Set types for features.
#'
#' If `data` is a `data.frame` and `feature_types` is not supplied, feature types will be deduced
#' automatically from the column types.
#'
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
#' with the following possible values:\itemize{
#' \item "c", which represents categorical columns.
#' \item "q", which represents numeric columns.
#' \item "int", which represents integer columns.
#' \item "i", which represents logical (boolean) columns.
#' }
#'
#' Note that, while categorical types are treated differently from the rest for model fitting
#' purposes, the other types do not influence the generated model, but have effects in other
#' functionalities such as feature importances.
#'
#' \bold{Important}: categorical features, if specified manually through `feature_types`, must
#' be encoded as integers with numeration starting at zero, and the same encoding needs to be
#' applied when passing data to `predict`. Even if passing `factor` types, the encoding will
#' not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
#' subclass 'xgb.QuantileDMatrix'.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' ## Keep the number of threads to 1 for examples
#' nthread <- 1
#' data.table::setDTthreads(nthread)
#' dtrain <- with(
#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
#' )
#' fname <- file.path(tempdir(), "xgb.DMatrix.data")
#' xgb.DMatrix.save(dtrain, fname)
#' dtrain <- xgb.DMatrix(fname)
#' @export
#' @rdname xgb.DMatrix
xgb.DMatrix <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  missing = NA,
  silent = FALSE,
  feature_names = colnames(data),
  feature_types = NULL,
  nthread = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL
) {
  # 'group' and 'qid' cannot be supplied together.
  if (!is.null(group) && !is.null(qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }
  # NOTE(review): 'NVL' is defined elsewhere; presumably it substitutes -1L
  # when 'nthread' is NULL - confirm against its definition.
  nthread <- as.integer(NVL(nthread, -1L))

  # Dispatch on the input type; every supported branch yields a native handle.
  if (typeof(data) == "character") {
    # Character input is treated as a file path, so exactly one element is
    # required. A zero-length vector is rejected here as well, instead of
    # being forwarded to the native file loader.
    if (length(data) != 1L) {
      stop(
        "'data' has class 'character' and length ", length(data),
        ".\n 'data' accepts either a numeric matrix or a single filename."
      )
    }
    data <- path.expand(data)
    handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
  } else if (is.matrix(data)) {
    handle <- .Call(
      XGDMatrixCreateFromMat_R, data, missing, nthread
    )
  } else if (inherits(data, "dgCMatrix")) {
    # CSC input: the number of rows is passed alongside the sparse slots.
    handle <- .Call(
      XGDMatrixCreateFromCSC_R,
      data@p,
      data@i,
      data@x,
      nrow(data),
      missing,
      nthread
    )
  } else if (inherits(data, "dgRMatrix")) {
    # CSR input: the number of columns is passed alongside the sparse slots.
    handle <- .Call(
      XGDMatrixCreateFromCSR_R,
      data@p,
      data@j,
      data@x,
      ncol(data),
      missing,
      nthread
    )
  } else if (inherits(data, "dsparseVector")) {
    # A sparse vector goes through the CSR constructor as a single row: the
    # row pointer spans all stored entries, one is subtracted from the stored
    # positions, and the vector length is used as the number of columns.
    indptr <- c(0L, as.integer(length(data@i)))
    ind <- as.integer(data@i) - 1L
    handle <- .Call(
      XGDMatrixCreateFromCSR_R,
      indptr,
      ind,
      data@x,
      length(data),
      missing,
      nthread
    )
  } else if (is.data.frame(data)) {
    # Columns are converted to numeric vectors, and feature types are deduced
    # from the column types when the caller did not supply them.
    tmp <- .process.df.for.dmatrix(data, feature_types)
    feature_types <- tmp$feature_types
    handle <- .Call(
      XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
    )
    rm(tmp)
  } else {
    # Report the class rather than the storage type: 'typeof' would only say
    # "S4", "double" or "list" for most unsupported objects, which does not
    # tell the user what was actually passed.
    stop("xgb.DMatrix does not support construction from ", class(data)[1L])
  }

  dmat <- handle
  attributes(dmat) <- list(
    class = "xgb.DMatrix",
    fields = new.env()
  )
  # The default for 'feature_names' (colnames(data)) is only evaluated here,
  # i.e. after 'data' may have been replaced by the expanded file path.
  .set.dmatrix.fields(
    dmat = dmat,
    label = label,
    weight = weight,
    base_margin = base_margin,
    feature_names = feature_names,
    feature_types = feature_types,
    group = group,
    qid = qid,
    label_lower_bound = label_lower_bound,
    label_upper_bound = label_upper_bound,
    feature_weights = feature_weights
  )
  return(dmat)
}
# Prepares a data.frame for the native DMatrix constructors.
#
# Returns a list with two entries:
#  - 'lst': the columns of 'df' as a list of numeric vectors, with factor
#    columns shifted so that their codes start at zero.
#  - 'feature_types': the types supplied by the caller or, when NULL was
#    passed, one type per column deduced from the column classes.
.process.df.for.dmatrix <- function(df, feature_types) {
  if (nrow(df) == 0L || ncol(df) == 0L) {
    stop("'data' is an empty data.frame.")
  }

  if (is.null(feature_types)) {
    # Maps a single column to its feature type; factors are tested first.
    deduce_type <- function(col) {
      if (is.factor(col)) {
        return("c")
      }
      if (is.integer(col)) {
        return("int")
      }
      if (is.logical(col)) {
        return("i")
      }
      if (is.numeric(col)) {
        return("float")
      }
      stop("Invalid type in dataframe.")
    }
    feature_types <- vapply(df, deduce_type, character(1L))
  } else {
    types_ok <- is.character(feature_types) && length(feature_types) == ncol(df)
    if (!types_ok) {
      stop(
        "'feature_types' must be a character vector with one entry per column in 'data'."
      )
    }
  }

  # Factor codes are base-1 in R, hence the shift by one for those columns.
  numeric_cols <- lapply(df, function(col) {
    if (is.factor(col)) as.numeric(col) - 1 else as.numeric(col)
  })

  list(lst = numeric_cols, feature_types = feature_types)
}
# Assigns every non-NULL field to 'dmat' through 'setinfo', in a fixed order
# (label, weight, base_margin, feature names, feature types, group, qid,
# label bounds, feature weights). Fields passed as NULL are left untouched.
.set.dmatrix.fields <- function(
  dmat,
  label,
  weight,
  base_margin,
  feature_names,
  feature_types,
  group,
  qid,
  label_lower_bound,
  label_upper_bound,
  feature_weights
) {
  # Sets one field, but only when a value was actually supplied for it.
  set_if_supplied <- function(field, value) {
    if (!is.null(value)) {
      setinfo(dmat, field, value)
    }
  }

  set_if_supplied("label", label)
  set_if_supplied("weight", weight)
  set_if_supplied("base_margin", base_margin)
  # Note the singular field names used for feature names and types.
  set_if_supplied("feature_name", feature_names)
  set_if_supplied("feature_type", feature_types)
  set_if_supplied("group", group)
  set_if_supplied("qid", qid)
  set_if_supplied("label_lower_bound", label_lower_bound)
  set_if_supplied("label_upper_bound", label_upper_bound)
  set_if_supplied("feature_weights", feature_weights)
}
#' @param ref The training dataset that provides quantile information, needed when creating
#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix
#' as a reference means that the same quantisation applied to the training data is
#' applied to the validation/test data
#' @param max_bin The number of histogram bins, should be consistent with the training parameter
#' `max_bin`.
#'
#' This is only supported when constructing a QuantileDMatrix.
#' @export
#' @rdname xgb.DMatrix
xgb.QuantileDMatrix <- function(
data,
label = NULL,
weight = NULL,
base_margin = NULL,
missing = NA,
feature_names = colnames(data),
feature_types = NULL,
nthread = NULL,
group = NULL,
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL,
ref = NULL,
max_bin = NULL
) {
# Builds the quantized DMatrix through the same callback-based native entry
# point used by 'xgb.QuantileDMatrix.from_iterator', by wrapping the inputs
# in an iterator that supplies them as a single batch.
# NOTE(review): 'NVL' is defined elsewhere; presumably it substitutes -1L
# when 'nthread' is NULL - confirm against its definition.
nthread <- as.integer(NVL(nthread, -1L))
if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
stop("'ref' must be an xgb.DMatrix object.")
}
# Note: when passing an integer matrix, it won't get casted to numeric.
# Since 'int' values as understood by languages like C cannot have missing values,
# R represents missingness there by assigning them a value equal to the minimum
# integer. The 'missing' value here is set before the data, so in case of integers,
# need to make the conversion manually beforehand.
if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) {
missing <- .Call(XGGetRNAIntAsDouble)
}
# All inputs are stored in an environment that the single-batch iterator
# callbacks read from. Building this list also evaluates the default of
# 'feature_names' (colnames(data)).
iterator_env <- as.environment(
list(
data = data,
label = label,
weight = weight,
base_margin = base_margin,
missing = missing,
feature_names = feature_names,
feature_types = feature_types,
group = group,
qid = qid,
label_lower_bound = label_lower_bound,
label_upper_bound = label_upper_bound,
feature_weights = feature_weights
)
)
# '.single.data.iterator' adds an 'iter' counter to the environment and
# returns an 'xgb.DataIter' that yields the stored data once, then NULL.
data_iterator <- .single.data.iterator(iterator_env)
# Note: the ProxyDMatrix has its finalizer assigned in the R externalptr
# object, but that finalizer will only be called once the object is
# garbage-collected, which doesn't happen immediately after it goes out
# of scope, hence this piece of code to trigger its destruction earlier
# and free memory right away.
proxy_handle <- .make.proxy.handle()
on.exit({
.Call(XGDMatrixFree_R, proxy_handle)
})
# Zero-argument callbacks handed to the native code. 'iterator_next' loads the
# next batch into the proxy and returns 1L, or returns 0L when no batch is left.
iterator_next <- function() {
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
iterator_reset <- function() {
return(data_iterator$f_reset(iterator_env))
}
# NOTE(review): presumably the native code evaluates the callbacks in this
# environment - confirm against the C implementation.
calling_env <- environment()
dmat <- .Call(
XGQuantileDMatrixCreateFromCallback_R,
iterator_next,
iterator_reset,
calling_env,
proxy_handle,
nthread,
missing,
max_bin,
ref
)
# The result shares the 'fields' environment that was attached to the proxy,
# which is the object 'setinfo' was called on while the batch was loaded.
attributes(dmat) <- list(
class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
fields = attributes(proxy_handle)$fields
)
return(dmat)
}
#' @title XGBoost Data Iterator
#' @description Interface to create a custom data iterator in order to construct a DMatrix
#' from external memory.
#'
#' This function is responsible for generating an R object structure containing callback
#' functions and an environment shared with them.
#'
#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
#' which will consume the data and create a DMatrix from it by executing the callback functions.
#'
#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
#' @param env An R environment to pass to the callback functions supplied here, which can be
#' used to keep track of variables to determine how to handle the batches.
#'
#' For example, one might want to keep track of an iteration number in this environment in order
#' to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:\itemize{
#' \item Accessing or retrieving the next batch of data in the iterator.
#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
#' be done by modifying variables in the `env` variable that is passed here.
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
#' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
#' }
#' @param f_reset `function(env)` which is responsible for resetting the data iterator
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
#' has been consumed).
#'
#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
#' (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to \link{xgb.ExternalDMatrix}.
#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  # Both callbacks are mandatory and must be callable.
  if (!is.function(f_next)) {
    stop("'f_next' must be a function.")
  }
  if (!is.function(f_reset)) {
    stop("'f_reset' must be a function.")
  }

  # The iterator is a plain list holding the inputs, tagged with an S3 class.
  structure(
    list(env = env, f_next = f_next, f_reset = f_reset),
    class = "xgb.DataIter"
  )
}
# 'f_next' callback of the single-batch iterator: hands out the data stored
# in 'env' as one 'xgb.DataBatch' on the first call after a reset, and NULL
# on later calls.
.qdm.single.fnext <- function(env) {
  batches_served <- env[["iter"]]
  # The only batch was already handed out; the counter is left unchanged.
  if (batches_served >= 1L) {
    return(NULL)
  }
  # The counter is advanced when this call exits, including when building
  # the batch below fails.
  on.exit(env[["iter"]] <- batches_served + 1L)
  xgb.DataBatch(
    data = env[["data"]],
    label = env[["label"]],
    weight = env[["weight"]],
    base_margin = env[["base_margin"]],
    feature_names = env[["feature_names"]],
    feature_types = env[["feature_types"]],
    group = env[["group"]],
    qid = env[["qid"]],
    label_lower_bound = env[["label_lower_bound"]],
    label_upper_bound = env[["label_upper_bound"]],
    feature_weights = env[["feature_weights"]]
  )
}
# 'f_reset' callback of the single-batch iterator: puts the batch counter
# stored in 'env' back to zero. Returns NULL invisibly.
.qdm.single.freset <- function(env) {
  env$iter <- 0L
  invisible(NULL)
}
# Builds an 'xgb.DataIter' over 'env' that supplies the data stored there as
# a single batch. The batch counter in 'env' starts at zero.
.single.data.iterator <- function(env) {
  env$iter <- 0L
  xgb.DataIter(
    env = env,
    f_next = .qdm.single.fnext,
    f_reset = .qdm.single.freset
  )
}
# Only for internal usage.
# Creates a native proxy DMatrix and tags it with the 'xgb.ProxyDMatrix'
# subclass and a fresh, empty 'fields' environment.
.make.proxy.handle <- function() {
  proxy <- .Call(XGProxyDMatrixCreate_R)
  proxy_attrs <- list(
    class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
    fields = new.env()
  )
  attributes(proxy) <- proxy_attrs
  proxy
}
#' @title Structure for Data Batches
#' @description Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
#'
#' This function is \bold{only} meant to be called inside of a callback function (which
#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
#' when constructing a DMatrix through external memory - otherwise, one should call
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
#'
#' The object that results from calling this function directly is \bold{not} like
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
#' @inheritParams xgb.DMatrix
#' @param data Batch of data belonging to this batch.
#'
#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible
#' to pass here. Supported types are:\itemize{
#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
#' `integer` and `logical`, missing values might not be automatically recognized
#' as such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix}
#' for details on this.
#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
#' conversions applied to it. See the documentation for parameter `data` in
#' \link{xgb.DMatrix} for details on it.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' }
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
#' @export
xgb.DataBatch <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  feature_names = colnames(data),
  feature_types = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL
) {
  # Only dense matrices, data frames and CSR matrices can be used as batches.
  stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))

  # The batch is a plain list of the inputs; entries passed as NULL are kept
  # in the list so that it always has the same eleven names.
  structure(
    list(
      data = data,
      label = label,
      weight = weight,
      base_margin = base_margin,
      feature_names = feature_names,
      feature_types = feature_types,
      group = group,
      qid = qid,
      label_lower_bound = label_lower_bound,
      label_upper_bound = label_upper_bound,
      feature_weights = feature_weights
    ),
    class = "xgb.DataBatch"
  )
}
# This is only for internal usage, class is not exposed to the user.
# Fetches the next batch from 'data_iterator' and loads its data and fields
# into 'proxy_handle'. Returns 1L when a batch was loaded and 0L when the
# iterator returned NULL (no batches left).
xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
  batch <- data_iterator$f_next(data_iterator$env)
  if (is.null(batch)) {
    return(0L)
  }
  if (!inherits(batch, "xgb.DataBatch")) {
    stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
  }
  if (!(is.null(batch$group) || is.null(batch$qid))) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }

  batch_data <- batch$data
  feature_types <- batch$feature_types
  if (is.data.frame(batch_data)) {
    # Data frames are converted to a list of numeric columns first, which
    # may also fill in the feature types when none were supplied.
    converted <- .process.df.for.dmatrix(batch_data, feature_types)
    feature_types <- converted$feature_types
    .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, converted$lst)
    rm(converted)
  } else if (is.matrix(batch_data)) {
    .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, batch_data)
  } else if (inherits(batch_data, "dgRMatrix")) {
    csr_parts <- list(
      p = batch_data@p,
      j = batch_data@j,
      x = batch_data@x,
      ncol = ncol(batch_data)
    )
    .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, csr_parts)
  } else {
    stop("'data' has unsupported type.")
  }

  .set.dmatrix.fields(
    dmat = proxy_handle,
    label = batch$label,
    weight = batch$weight,
    base_margin = batch$base_margin,
    feature_names = batch$feature_names,
    feature_types = feature_types,
    group = batch$group,
    qid = batch$qid,
    label_lower_bound = batch$label_lower_bound,
    label_upper_bound = batch$label_upper_bound,
    feature_weights = batch$feature_weights
  )
  1L
}
#' @title DMatrix from External Data
#' @description Create a special type of xgboost 'DMatrix' object from external data
#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
#' bigger set that might not fit entirely in memory.
#'
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
#' without being concatenated, but note that fields like 'label' \bold{will} be
#' concatenated from multiple calls to the data iterator.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.DMatrix
#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter},
#' which includes an environment shared between function calls, and functions to access
#' the data in batches on-demand.
#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
#' @param missing A float value that represents missing values in data.
#'
#' Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it
#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
#' it will not be adapted for different input types.
#'
#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
#' which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
#' 'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#' held internally but accessed through the iterator when needed.
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
#' @examples
#' library(xgboost)
#' data(mtcars)
#'
#' # this custom environment will be passed to the iterator
#' # functions at each call. It's up to the user to keep
#' # track of the iteration number in this environment.
#' iterator_env <- as.environment(
#' list(
#' iter = 0,
#' x = mtcars[, -1],
#' y = mtcars[, 1]
#' )
#' )
#'
#' # Data is passed in two batches.
#' # In this example, batches are obtained by subsetting the 'x' variable.
#' # This is not advantageous to do, since the data is already loaded in memory
#' # and can be passed in full in one go, but there can be situations in which
#' # only a subset of the data will fit in the computer's memory, and it can
#' # be loaded in batches that are accessed one-at-a-time only.
#' iterator_next <- function(iterator_env) {
#' curr_iter <- iterator_env[["iter"]]
#' if (curr_iter >= 2) {
#' # there are only two batches, so this signals end of the stream
#' return(NULL)
#' }
#'
#' if (curr_iter == 0) {
#' x_batch <- iterator_env[["x"]][1:16, ]
#' y_batch <- iterator_env[["y"]][1:16]
#' } else {
#' x_batch <- iterator_env[["x"]][17:32, ]
#' y_batch <- iterator_env[["y"]][17:32]
#' }
#' on.exit({
#' iterator_env[["iter"]] <- curr_iter + 1
#' })
#'
#' # Function 'xgb.DataBatch' must be called manually
#' # at each batch with all the appropriate attributes,
#' # such as feature names and feature types.
#' return(xgb.DataBatch(data = x_batch, label = y_batch))
#' }
#'
#' # This moves the iterator back to its beginning
#' iterator_reset <- function(iterator_env) {
#' iterator_env[["iter"]] <- 0
#' }
#'
#' data_iterator <- xgb.DataIter(
#' env = iterator_env,
#' f_next = iterator_next,
#' f_reset = iterator_reset
#' )
#' cache_prefix <- tempdir()
#'
#' # DMatrix will be constructed from the iterator's batches
#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
#'
#' # After construction, can be used as a regular DMatrix
#' params <- list(nthread = 1, objective = "reg:squarederror")
#' model <- xgb.train(data = dm, nrounds = 2, params = params)
#'
#' # Predictions can also be called on it, and should be the same
#' # as if the data were passed differently.
#' pred_dm <- predict(model, dm)
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
#' @export
xgb.ExternalDMatrix <- function(
data_iterator,
cache_prefix = tempdir(),
missing = NA,
nthread = NULL
) {
# Input validation: the iterator must come from 'xgb.DataIter' and the cache
# prefix must be a character path (expanded below).
stopifnot(inherits(data_iterator, "xgb.DataIter"))
stopifnot(is.character(cache_prefix))
cache_prefix <- path.expand(cache_prefix)
# NOTE(review): 'NVL' is defined elsewhere; presumably it substitutes -1L
# when 'nthread' is NULL - confirm against its definition.
nthread <- as.integer(NVL(nthread, -1L))
# The proxy receives each batch in turn. It is freed explicitly when this
# function exits instead of waiting for garbage collection.
proxy_handle <- .make.proxy.handle()
on.exit({
.Call(XGDMatrixFree_R, proxy_handle)
})
# Zero-argument callbacks handed to the native code. 'iterator_next' loads the
# next batch into the proxy and returns 1L, or returns 0L when no batch is left.
iterator_next <- function() {
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
# Delegates to the user-supplied reset function, passing the iterator's environment.
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
}
# NOTE(review): presumably the native code evaluates the callbacks in this
# environment - confirm against the C implementation.
calling_env <- environment()
dmat <- .Call(
XGDMatrixCreateFromCallback_R,
iterator_next,
iterator_reset,
calling_env,
proxy_handle,
nthread,
missing,
cache_prefix
)
# The result shares the 'fields' environment that was attached to the proxy,
# which is the object 'setinfo' was called on while batches were loaded.
attributes(dmat) <- list(
class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
fields = attributes(proxy_handle)$fields
)
return(dmat)
}
#' @title QuantileDMatrix from External Data
#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
#' a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
#'
#' Note that, while external data will only be loaded through the iterator (thus the full data
#' might not be held entirely in-memory), the quantized representation of the data will get
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
#' version is typically lighter than the original data, so there might be cases in which this
#' representation could potentially fit in memory even if the full data doesn't.
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
#' \link{xgb.QuantileDMatrix}
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
data_iterator,
missing = NA,
nthread = NULL,
ref = NULL,
max_bin = NULL
) {
# Input validation: the iterator must come from 'xgb.DataIter' and 'ref',
# when given, must be an existing DMatrix.
stopifnot(inherits(data_iterator, "xgb.DataIter"))
if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
stop("'ref' must be an xgb.DMatrix object.")
}
# NOTE(review): 'NVL' is defined elsewhere; presumably it substitutes -1L
# when 'nthread' is NULL - confirm against its definition.
nthread <- as.integer(NVL(nthread, -1L))
# The proxy receives each batch in turn. It is freed explicitly when this
# function exits instead of waiting for garbage collection.
proxy_handle <- .make.proxy.handle()
on.exit({
.Call(XGDMatrixFree_R, proxy_handle)
})
# Zero-argument callbacks handed to the native code. 'iterator_next' loads the
# next batch into the proxy and returns 1L, or returns 0L when no batch is left.
iterator_next <- function() {
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
}
# Delegates to the user-supplied reset function, passing the iterator's environment.
iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env))
}
# NOTE(review): presumably the native code evaluates the callbacks in this
# environment - confirm against the C implementation.
calling_env <- environment()
# Same native entry point as the one called by 'xgb.QuantileDMatrix'.
dmat <- .Call(
XGQuantileDMatrixCreateFromCallback_R,
iterator_next,
iterator_reset,
calling_env,
proxy_handle,
nthread,
missing,
max_bin,
ref
)
# The result shares the 'fields' environment that was attached to the proxy,
# which is the object 'setinfo' was called on while batches were loaded.
attributes(dmat) <- list(
class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
fields = attributes(proxy_handle)$fields
)
return(dmat)
}
#' @title Check whether DMatrix object has a field
#' @description Checks whether an xgb.DMatrix object has a given field assigned to
#' it, such as weights, labels, etc.
#' @param object The DMatrix object to check for the given \code{info} field.
#' @param info The field to check for presence or absence in \code{object}.
#' @seealso \link{xgb.DMatrix}, \link{getinfo.xgb.DMatrix}, \link{setinfo.xgb.DMatrix}
#' @examples
#' library(xgboost)
#' x <- matrix(1:10, nrow = 5)
#' dm <- xgb.DMatrix(x, nthread = 1)
#'
#' # 'dm' so far doesn't have any fields set
#' xgb.DMatrix.hasinfo(dm, "label")
#'
#' # Fields can be added after construction
#' setinfo(dm, "label", 1:5)
#' xgb.DMatrix.hasinfo(dm, "label")
#' @export
xgb.DMatrix.hasinfo <- function(object, info) {
  if (!inherits(object, "xgb.DMatrix")) {
    stop("Object is not an 'xgb.DMatrix'.")
  }

  # A DMatrix whose native pointer is null cannot be queried: warn and
  # report the field as absent.
  handle_is_null <- .Call(XGCheckNullPtr_R, object)
  if (handle_is_null) {
    warning("xgb.DMatrix object is invalid. Must be constructed again.")
    return(FALSE)
  }

  # Look the field up in the 'fields' environment attached to the object;
  # 'NVL' supplies FALSE as the fallback for fields with no entry.
  field_flags <- attr(object, "fields")
  NVL(field_flags[[info]], FALSE)
}
# Internal helper: obtains an xgb.DMatrix from 'data'.
#  - dense matrix or 'dgCMatrix': a new DMatrix is built; 'label' is required
#    and 'weight' is attached when supplied.
#  - character: the first element is used as the path of a file to load.
#  - xgb.DMatrix: returned as-is.
# In the last two cases a supplied 'label' is ignored with a warning.
# Any other input, data frames included, is an error.
xgb.get.DMatrix <- function(data, label, missing, weight, nthread) {
  in_memory_matrix <- is.matrix(data) || inherits(data, "dgCMatrix")
  if (in_memory_matrix) {
    if (is.null(label)) {
      stop("label must be provided when data is a matrix")
    }
    dmat <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread)
    if (!is.null(weight)) {
      setinfo(dmat, "weight", weight)
    }
    return(dmat)
  }

  # The warning is issued before the input type is examined any further.
  if (!is.null(label)) {
    warning("xgboost: label will be ignored.")
  }
  if (is.character(data)) {
    return(xgb.DMatrix(path.expand(data)[1]))
  }
  if (inherits(data, "xgb.DMatrix")) {
    return(data)
  }
  if (inherits(data, "data.frame")) {
    stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
  }
  stop("xgboost: invalid input data")
}
#' Dimensions of xgb.DMatrix
#'
#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{xgb.DMatrix} object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#'
#' @export
dim.xgb.DMatrix <- function(x) {
  # Rows are queried first, then columns; both come from the native object.
  n_rows <- .Call(XGDMatrixNumRow_R, x)
  n_cols <- .Call(XGDMatrixNumCol_R, x)
  c(n_rows, n_cols)
}
#' Handling of column names of \code{xgb.DMatrix}
#'
#' Only column names are supported for \code{xgb.DMatrix}, thus setting of
#' row names would have no effect and returned row names would be NULL.
#'
#' @param x object of class \code{xgb.DMatrix}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label, nthread = 2)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose=TRUE)
#'
#' @rdname dimnames.xgb.DMatrix
#' @export
dimnames.xgb.DMatrix <- function(x) {
  # Row names are never available, so the first entry is always NULL; the
  # second entry holds the stored feature names.
  list(NULL, getinfo(x, "feature_name"))
}
#' @rdname dimnames.xgb.DMatrix
#' @export
`dimnames<-.xgb.DMatrix` <- function(x, value) {
  # Only a two-element list is accepted, and its first element (row names)
  # must be NULL.
  well_formed <- is.list(value) && length(value) == 2L
  if (!well_formed) {
    stop("invalid 'dimnames' given: must be a list of two elements")
  }
  if (!is.null(value[[1L]])) {
    stop("xgb.DMatrix does not have rownames")
  }

  col_names <- value[[2L]]
  if (!is.null(col_names)) {
    n_names <- length(col_names)
    if (ncol(x) != n_names) {
      stop("can't assign ", n_names, " colnames to a ", ncol(x), " column xgb.DMatrix")
    }
  }
  # A NULL 'col_names' is passed through as-is, without a length check.
  setinfo(x, "feature_name", col_names)
  x
}
#' @title Get or set information of xgb.DMatrix and xgb.Booster objects
#' @param object Object of class \code{xgb.DMatrix} or `xgb.Booster`.
#' @param name the name of the information field to get (see details)
#' @return For `getinfo`, will return the requested field. For `setinfo`, will always return value `TRUE`
#' if it succeeds.
#' @details
#' The \code{name} field can be one of the following for `xgb.DMatrix`:
#'
#' \itemize{
#' \item \code{label}
#' \item \code{weight}
#' \item \code{base_margin}
#' \item \code{label_lower_bound}
#' \item \code{label_upper_bound}
#' \item \code{group}
#' \item \code{feature_type}
#' \item \code{feature_name}
#' \item \code{nrow}
#' }
#' See the documentation for \link{xgb.DMatrix} for more information about these fields.
#'
#' For `xgb.Booster`, can be one of the following:
#' \itemize{
#' \item \code{feature_type}
#' \item \code{feature_name}
#' }
#'
#' Note that, while 'qid' cannot be retrieved, it's possible to get the equivalent 'group'
#' for a DMatrix that had 'qid' assigned.
#'
#' \bold{Important}: when calling `setinfo`, the objects are modified in-place. See
#' \link{xgb.copy.Booster} for an idea of how this in-place assignment works.
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#'
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, name) {
  # S3 generic: dispatches on the class of 'object'.
  UseMethod("getinfo")
}
#' @rdname getinfo
#' @export
getinfo.xgb.DMatrix <- function(object, name) {
  # Field names are grouped by the native accessor used to read them.
  allowed_int_fields <- 'group'
  allowed_float_fields <- c(
    'label', 'weight', 'base_margin',
    'label_lower_bound', 'label_upper_bound'
  )
  allowed_str_fields <- c("feature_type", "feature_name")
  allowed_fields <- c(allowed_float_fields, allowed_int_fields, allowed_str_fields, 'nrow')

  # 'name' must be a single string naming one of the known fields.
  if (typeof(name) != "character" ||
        length(name) != 1 ||
        !name %in% allowed_fields) {
    stop("getinfo: name must be one of the following\n",
         paste(paste0("'", allowed_fields, "'"), collapse = ", "))
  }

  if (name == "nrow") {
    ret <- nrow(object)
  } else if (name %in% allowed_str_fields) {
    ret <- .Call(XGDMatrixGetStrFeatureInfo_R, object, name)
  } else if (name %in% allowed_float_fields) {
    ret <- .Call(XGDMatrixGetFloatInfo_R, object, name)
    # More values than rows is taken to mean several values per row; they are
    # laid out row by row into a matrix with one row per observation.
    if (length(ret) > nrow(object)) {
      ret <- matrix(ret, nrow = nrow(object), byrow = TRUE)
    }
  } else if (name %in% allowed_int_fields) {
    # 'group' is read from the native 'group_ptr' field.
    if (name == "group") {
      name <- "group_ptr"
    }
    # The result is returned as a plain vector and never reshaped: its length
    # is not tied to the number of rows, so the "more values than rows" rule
    # used for float fields would scramble it.
    # NOTE(review): 'group_ptr' is presumably an offsets vector with one more
    # entry than there are groups, which exceeds nrow when every group holds a
    # single row - confirm against the XGBoost C API.
    ret <- .Call(XGDMatrixGetUIntInfo_R, object, name)
  }

  # Fields that were never set come back empty and are reported as NULL.
  if (length(ret) == 0) return(NULL)
  ret
}
#' @rdname getinfo
#' @param info the specific field of information to set
#'
#' @details
#' See the documentation for \link{xgb.DMatrix} for possible fields that can be set
#' (which correspond to arguments in that function).
#'
#' Note that the following fields are allowed in the construction of an \code{xgb.DMatrix}
#' but \bold{aren't} allowed here:\itemize{
#' \item data
#' \item missing
#' \item silent
#' \item nthread
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @export
setinfo <- function(object, name, info) {
  # S3 generic: dispatches on the class of 'object'.
  UseMethod("setinfo")
}
#' @rdname getinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info) {
  # The internal worker validates 'info' and performs the actual assignment.
  .internal.setinfo.xgb.DMatrix(object, name, info)

  # Flag the field as set in the "fields" attribute of the object.
  known_fields <- attr(object, "fields")
  known_fields[[name]] <- TRUE
  attr(object, "fields") <- known_fields
  TRUE
}
# Internal worker behind setinfo(): checks 'info' against the dimensions of
# 'object' for the fields that have such a check, then passes it to the native
# setter. Returns TRUE on success and stops on an unrecognised field name.
.internal.setinfo.xgb.DMatrix <- function(object, name, info) {
  # Setter shared by the string-valued per-column fields. A NULL 'info' is
  # passed through unchanged; anything else is converted to a list and must
  # have one entry per column.
  set_feat_info <- function(field) {
    msg <- sprintf(
      "The number of %s must equal to the number of columns in the input data. %s vs. %s",
      field,
      length(info),
      ncol(object)
    )
    if (!is.null(info)) {
      info <- as.list(info)
      if (length(info) != ncol(object)) {
        stop(msg)
      }
    }
    .Call(XGDMatrixSetStrFeatureInfo_R, object, field, info)
  }

  if (name == "label") {
    if (NROW(info) != nrow(object)) {
      stop("The length of labels must equal to the number of rows in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "label_lower_bound") {
    if (NROW(info) != nrow(object)) {
      stop("The length of lower-bound labels must equal to the number of rows in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "label_upper_bound") {
    if (NROW(info) != nrow(object)) {
      stop("The length of upper-bound labels must equal to the number of rows in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "weight" || name == "base_margin") {
    # No size check is done on the R side for these two fields.
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "group") {
    # Group sizes have to add up to the number of rows.
    if (sum(info) != nrow(object)) {
      stop("The sum of groups must equal to the number of rows in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "qid") {
    if (NROW(info) != nrow(object)) {
      stop("The length of qid assignments must equal to the number of rows in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "feature_weights") {
    # Feature weights are per column rather than per row.
    if (NROW(info) != ncol(object)) {
      stop("The number of feature weights must equal to the number of columns in the input data")
    }
    .Call(XGDMatrixSetInfo_R, object, name, info)
  } else if (name == "feature_name") {
    set_feat_info("feature_name")
  } else if (name == "feature_type") {
    set_feat_info("feature_type")
  } else {
    stop("setinfo: unknown info name ", name)
  }
  TRUE
}
#' @title Get Quantile Cuts from DMatrix
#' @description Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method="hist"`).
#'
#' These cuts are used in order to assign observations to bins - i.e. these are ordered
#' boundaries which are used to determine assignment condition `border_low < x < border_high`.
#' As such, the first and last bin will be outside of the range of the data, so as to include
#' all of the observations there.
#'
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
#' which will be output in sorted order from lowest to highest.
#'
#' Different columns can have different numbers of bins according to their range.
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @param output Output format for the quantile cuts. Possible options are:\itemize{
#' \item `"list"` will return the output as a list with one entry per column, where
#' each column will have a numeric vector with the cuts. The list will be named if
#' `dmat` has column names assigned to it.
#' \item `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#' `indptr[i]+1` to `indptr[i+1]`.
#' }
#' @return The quantile cuts, in the format specified by parameter `output`.
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#'
#' # DMatrix is not quantized right away, but will be once a hist model is generated
#' model <- xgb.train(
#' data = dm,
#' params = list(
#' tree_method = "hist",
#' max_bin = 8,
#' nthread = 1
#' ),
#' nrounds = 3
#' )
#'
#' # Now can get the quantile cuts
#' xgb.get.DMatrix.qcut(dm)
#' @export
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  # With 'output' left at its default vector, the first choice ("list") wins.
  output <- head(output, 1L)
  stopifnot(output %in% c("list", "arrays"))

  # 'res' carries 'indptr' (base-0 offsets, one more than the number of
  # columns) and 'data' (the cut values of all columns, concatenated).
  res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
  if (output == "arrays") {
    return(res)
  }

  feature_names <- getinfo(dmat, "feature_name")
  ncols <- length(res$indptr) - 1
  # seq_len() gives an empty sequence when there are no columns, whereas
  # seq(1, 0) would count down through c(1, 0) and index past the end of
  # 'indptr'. max() guards against an empty 'indptr'.
  out <- lapply(
    seq_len(max(ncols, 0)),
    function(col) {
      st <- res$indptr[col]
      end <- res$indptr[col + 1]
      # A column without any cuts yields an empty numeric vector.
      if (end <= st) {
        return(numeric())
      }
      # Shift the base-0 offsets to R's base-1 indexing.
      res$data[seq(1 + st, end)]
    }
  )
  if (NROW(feature_names)) {
    names(out) <- feature_names
  }
  out
}
#' @title Get Number of Non-Missing Entries in DMatrix
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The number of non-missing entries in the DMatrix
#' @export
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
  # Anything that is not an xgb.DMatrix is rejected before the native call.
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  n_present <- .Call(XGDMatrixNumNonMissing_R, dmat)
  n_present
}
#' @title Get DMatrix Data
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
#' from package `Matrix`). If it had feature names, these will be added as column names
#' in the output.
#' @export
xgb.get.DMatrix.data <- function(dmat) {
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  csr <- .Call(XGDMatrixGetDataAsCSR_R, dmat)

  # Start from an empty CSR matrix and fill its slots one at a time.
  result <- methods::new("dgRMatrix")
  # 'indptr' holds one more entry than there are rows.
  n_rows <- as.integer(length(csr$indptr) - 1)
  result@p <- csr$indptr
  result@j <- csr$indices
  result@x <- csr$data
  result@Dim <- as.integer(c(n_rows, csr$ncols))

  # Column names are attached only when feature names are present; row names
  # are always NULL.
  feature_names <- getinfo(dmat, "feature_name")
  col_names <- if (NROW(feature_names)) feature_names else NULL
  result@Dimnames <- list(NULL, col_names)
  result
}
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param idxset an integer vector of indices of rows needed
#' @param colset currently not used (columns subsetting is not available)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' all.equal(labels1, labels2)
#'
#' @rdname xgb.slice.DMatrix
#' @export
xgb.slice.DMatrix <- function(object, idxset) {
  if (!inherits(object, "xgb.DMatrix")) {
    stop("object must be xgb.DMatrix")
  }
  ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)

  # R-level attributes whose leading extent equals the number of rows are
  # treated as per-row data and sliced with the same index set.
  attr_list <- attributes(object)
  nr <- nrow(object)
  # vapply() always yields a numeric vector, including for an empty attribute
  # list; numeric(1) also accepts the integer counts NROW() returns.
  len <- vapply(attr_list, NROW, numeric(1))
  for (nm in names(attr_list)[which(len == nr)]) {
    obj_attr <- attr(object, nm)
    if (NCOL(obj_attr) > 1) {
      # drop = FALSE keeps a two-dimensional attribute two-dimensional even
      # when a single row is selected.
      attr(ret, nm) <- obj_attr[idxset, , drop = FALSE]
    } else {
      attr(ret, nm) <- obj_attr[idxset]
    }
  }
  # The result always carries the plain "xgb.DMatrix" class.
  structure(ret, class = "xgb.DMatrix")
}
#' @rdname xgb.slice.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
  # 'colset' is part of the signature but is not used: only rows are sliced.
  sliced <- xgb.slice.DMatrix(object, idxset)
  sliced
}
#' Print xgb.DMatrix
#'
#' Print information about xgb.DMatrix.
#' Currently it displays dimensions and presence of info-fields and colnames.
#'
#' @param x an xgb.DMatrix object
#' @param verbose whether to print colnames (when present)
#' @param ... not currently used
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#'
#' dtrain
#' print(dtrain, verbose=TRUE)
#'
#' @method print xgb.DMatrix
#' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
  # When the native null-pointer check flags the handle, only a notice is
  # printed and nothing else is queried from the object.
  if (.Call(XGCheckNullPtr_R, x)) {
    cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
    return(invisible(x))
  }

  # Report the first matching specialised class, else the base class name.
  class_print <- "xgb.DMatrix"
  for (candidate in c("xgb.QuantileDMatrix", "xgb.ExternalDMatrix", "xgb.ProxyDMatrix")) {
    if (inherits(x, candidate)) {
      class_print <- candidate
      break
    }
  }
  cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')

  # List whichever of these info fields are present, in this fixed order.
  infos <- character(0)
  for (field in c('label', 'weight', 'base_margin')) {
    if (xgb.DMatrix.hasinfo(x, field)) {
      infos <- c(infos, field)
    }
  }
  if (length(infos) == 0) {
    infos <- 'NA'
  }
  cat(infos)

  # Column names are spelled out only in verbose mode; otherwise just their
  # presence is reported.
  cnames <- colnames(x)
  cat(' colnames:')
  if (verbose && !is.null(cnames)) {
    cat("\n'")
    cat(cnames, sep = "','")
    cat("'")
  } else if (is.null(cnames)) {
    cat(' no')
  } else {
    cat(' yes')
  }
  cat("\n")
  invisible(x)
}