[R] Add data iterator, quantile dmatrix, external memory, and missing feature_types (#9913)
This commit is contained in:
@@ -1,13 +1,42 @@
|
||||
#' Construct xgb.DMatrix object
|
||||
#'
|
||||
#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
|
||||
#' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
|
||||
#' \code{\link{xgb.DMatrix.save}}).
|
||||
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
|
||||
#' such as \link{xgb.train} or \link{predict.xgb.Booster}.
|
||||
#'
|
||||
#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
|
||||
#' a \code{dgRMatrix} object,
|
||||
#' a \code{dsparseVector} object (only when making predictions from a fitted model, will be
|
||||
#' interpreted as a row vector), or a character string representing a filename.
|
||||
#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
|
||||
#' method already applied to it, which can be used to reduce memory usage (compared to using a
|
||||
#' regular DMatrix first and then creating a quantization out of it) when using the histogram
|
||||
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
|
||||
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
|
||||
#' (`tree_method = "approx"`).
|
||||
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
|
||||
#' for getting predictions out of a fitted model.
|
||||
#'
|
||||
#' Supported input types are as follows:\itemize{
|
||||
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
|
||||
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
|
||||
#'
|
||||
#' If passing `enable_categorical=TRUE`, columns with `factor` type will be treated as categorical.
|
||||
#' Otherwise, if passing `enable_categorical=FALSE` and the data contains `factor` columns, an error
|
||||
#' will be thrown.
|
||||
#'
|
||||
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
|
||||
#' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
|
||||
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
|
||||
#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
|
||||
#' was constructed.
|
||||
#'
|
||||
#' Other column types are not supported.
|
||||
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
|
||||
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for
|
||||
#' 'xgb.QuantileDMatrix'.
|
||||
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
|
||||
#' as a single row (only when making predictions from a fitted model).
|
||||
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||
#' supported for 'xgb.QuantileDMatrix'.
|
||||
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||
#' \bold{not} supported for 'xgb.QuantileDMatrix'.
|
||||
#' }
|
||||
#' @param label Label of the training data.
|
||||
#' @param weight Weight for each instance.
|
||||
#'
|
||||
@@ -18,11 +47,32 @@
|
||||
#' @param base_margin Base margin used for boosting from existing model.
|
||||
#'
|
||||
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||
#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
|
||||
#' It is useful when a 0 or some other extreme value represents missing values in data.
|
||||
#' @param missing A float value to represent missing values in data (not used when creating DMatrix
|
||||
#' from text files).
|
||||
#' It is useful to change when a zero, infinite, or some other extreme value represents missing
|
||||
#' values in data.
|
||||
#' @param silent whether to suppress printing an informational message after loading from a file.
|
||||
#' @param feature_names Set names for features. Overrides column names in data
|
||||
#' frame and matrix.
|
||||
#'
|
||||
#' Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||
#' must be the same as in the DMatrix construction, regardless of the column names.
|
||||
#' @param feature_types Set types for features.
|
||||
#'
|
||||
#' If `data` is a `data.frame` and passing `enable_categorical=TRUE`, the types will be deduced
|
||||
#' automatically from the column types.
|
||||
#'
|
||||
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
|
||||
#' with the following possible values:\itemize{
|
||||
#' \item "c", which represents categorical columns.
|
||||
#' \item "q", which represents numeric columns.
|
||||
#' \item "int", which represents integer columns.
|
||||
#' \item "i", which represents logical (boolean) columns.
|
||||
#' }
|
||||
#'
|
||||
#' Note that, while categorical types are treated differently from the rest for model fitting
|
||||
#' purposes, the other types do not influence the generated model, but have effects in other
|
||||
#' functionalities such as feature importances.
|
||||
#' @param nthread Number of threads used for creating DMatrix.
|
||||
#' @param group Group size for all ranking groups.
|
||||
#' @param qid Query ID for data samples, used for ranking.
|
||||
@@ -41,6 +91,8 @@
|
||||
#' If 'data' is not a data frame, this argument is ignored.
|
||||
#'
|
||||
#' JSON/UBJSON serialization format is required for this.
|
||||
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||
#' subclass 'xgb.QuantileDMatrix'.
|
||||
#'
|
||||
#' @details
|
||||
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
||||
@@ -60,6 +112,7 @@
|
||||
#' xgb.DMatrix.save(dtrain, fname)
|
||||
#' dtrain <- xgb.DMatrix(fname)
|
||||
#' @export
|
||||
#' @rdname xgb.DMatrix
|
||||
xgb.DMatrix <- function(
|
||||
data,
|
||||
label = NULL,
|
||||
@@ -68,6 +121,7 @@ xgb.DMatrix <- function(
|
||||
missing = NA,
|
||||
silent = FALSE,
|
||||
feature_names = colnames(data),
|
||||
feature_types = NULL,
|
||||
nthread = NULL,
|
||||
group = NULL,
|
||||
qid = NULL,
|
||||
@@ -79,7 +133,7 @@ xgb.DMatrix <- function(
|
||||
if (!is.null(group) && !is.null(qid)) {
|
||||
stop("Either one of 'group' or 'qid' should be NULL")
|
||||
}
|
||||
ctypes <- NULL
|
||||
nthread <- as.integer(NVL(nthread, -1L))
|
||||
if (typeof(data) == "character") {
|
||||
if (length(data) > 1) {
|
||||
stop(
|
||||
@@ -91,7 +145,7 @@ xgb.DMatrix <- function(
|
||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||
} else if (is.matrix(data)) {
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
|
||||
XGDMatrixCreateFromMat_R, data, missing, nthread
|
||||
)
|
||||
} else if (inherits(data, "dgCMatrix")) {
|
||||
handle <- .Call(
|
||||
@@ -101,7 +155,7 @@ xgb.DMatrix <- function(
|
||||
data@x,
|
||||
nrow(data),
|
||||
missing,
|
||||
as.integer(NVL(nthread, -1))
|
||||
nthread
|
||||
)
|
||||
} else if (inherits(data, "dgRMatrix")) {
|
||||
handle <- .Call(
|
||||
@@ -111,7 +165,7 @@ xgb.DMatrix <- function(
|
||||
data@x,
|
||||
ncol(data),
|
||||
missing,
|
||||
as.integer(NVL(nthread, -1))
|
||||
nthread
|
||||
)
|
||||
} else if (inherits(data, "dsparseVector")) {
|
||||
indptr <- c(0L, as.integer(length(data@i)))
|
||||
@@ -123,41 +177,15 @@ xgb.DMatrix <- function(
|
||||
data@x,
|
||||
length(data),
|
||||
missing,
|
||||
as.integer(NVL(nthread, -1))
|
||||
nthread
|
||||
)
|
||||
} else if (is.data.frame(data)) {
|
||||
ctypes <- sapply(data, function(x) {
|
||||
if (is.factor(x)) {
|
||||
if (!enable_categorical) {
|
||||
stop(
|
||||
"When factor type is used, the parameter `enable_categorical`",
|
||||
" must be set to TRUE."
|
||||
)
|
||||
}
|
||||
"c"
|
||||
} else if (is.integer(x)) {
|
||||
"int"
|
||||
} else if (is.logical(x)) {
|
||||
"i"
|
||||
} else {
|
||||
if (!is.numeric(x)) {
|
||||
stop("Invalid type in dataframe.")
|
||||
}
|
||||
"float"
|
||||
}
|
||||
})
|
||||
## as.data.frame somehow converts integer/logical into real.
|
||||
data <- as.data.frame(sapply(data, function(x) {
|
||||
if (is.factor(x)) {
|
||||
## XGBoost uses 0-based indexing.
|
||||
as.numeric(x) - 1
|
||||
} else {
|
||||
x
|
||||
}
|
||||
}))
|
||||
tmp <- .process.df.for.dmatrix(data, enable_categorical, feature_types)
|
||||
feature_types <- tmp$feature_types
|
||||
handle <- .Call(
|
||||
XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
|
||||
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
|
||||
)
|
||||
rm(tmp)
|
||||
} else {
|
||||
stop("xgb.DMatrix does not support construction from ", typeof(data))
|
||||
}
|
||||
@@ -167,7 +195,81 @@ xgb.DMatrix <- function(
|
||||
class = "xgb.DMatrix",
|
||||
fields = new.env()
|
||||
)
|
||||
.set.dmatrix.fields(
|
||||
dmat = dmat,
|
||||
label = label,
|
||||
weight = weight,
|
||||
base_margin = base_margin,
|
||||
feature_names = feature_names,
|
||||
feature_types = feature_types,
|
||||
group = group,
|
||||
qid = qid,
|
||||
label_lower_bound = label_lower_bound,
|
||||
label_upper_bound = label_upper_bound,
|
||||
feature_weights = feature_weights
|
||||
)
|
||||
|
||||
return(dmat)
|
||||
}
|
||||
|
||||
# Prepare a data.frame for DMatrix construction.
#
# Checks that the data.frame is non-empty, determines the feature type of each
# column, and converts every column into a numeric vector.
#
# Arguments:
#   df                  data.frame holding the input data. Must have at least
#                       one row and one column.
#   enable_categorical  Whether `factor` columns are accepted. Only consulted
#                       when the types are deduced from the column classes
#                       (i.e. when 'feature_types' is NULL).
#   feature_types       Either NULL (types are deduced from the columns) or a
#                       character vector with one entry per column of 'df',
#                       which is then returned unchanged.
#
# Returns a list with two entries:
#   lst            list with one numeric vector per column of 'df'.
#   feature_types  character vector with one entry per column.
.process.df.for.dmatrix <- function(df, enable_categorical, feature_types) {
  if (!nrow(df) || !ncol(df)) {
    stop("'data' is an empty data.frame.")
  }
  if (!is.null(feature_types)) {
    if (!is.character(feature_types) || length(feature_types) != ncol(df)) {
      stop(
        "'feature_types' must be a character vector with one entry per column in 'data'."
      )
    }
  } else {
    # 'vapply' (instead of 'sapply') guarantees that the result is always a
    # character vector with exactly one type code per column.
    feature_types <- vapply(df, function(col) {
      if (is.factor(col)) {
        if (!enable_categorical) {
          stop(
            "When factor type is used, the parameter `enable_categorical`",
            " must be set to TRUE."
          )
        }
        return("c")
      } else if (is.integer(col)) {
        return("int")
      } else if (is.logical(col)) {
        return("i")
      } else {
        if (!is.numeric(col)) {
          stop("Invalid type in dataframe.")
        }
        return("float")
      }
    }, character(1L))
  }

  lst <- lapply(df, function(col) {
    # Must be checked before the conversion, which drops the factor class.
    is_factor <- is.factor(col)
    col <- as.numeric(col)
    if (is_factor) {
      # Factor codes are 1-based; shift them to the 0-based encoding.
      col <- col - 1
    }
    return(col)
  })

  return(list(lst = lst, feature_types = feature_types))
}
|
||||
|
||||
.set.dmatrix.fields <- function(
|
||||
dmat,
|
||||
label,
|
||||
weight,
|
||||
base_margin,
|
||||
feature_names,
|
||||
feature_types,
|
||||
group,
|
||||
qid,
|
||||
label_lower_bound,
|
||||
label_upper_bound,
|
||||
feature_weights
|
||||
) {
|
||||
if (!is.null(label)) {
|
||||
setinfo(dmat, "label", label)
|
||||
}
|
||||
@@ -180,6 +282,9 @@ xgb.DMatrix <- function(
|
||||
if (!is.null(feature_names)) {
|
||||
setinfo(dmat, "feature_name", feature_names)
|
||||
}
|
||||
if (!is.null(feature_types)) {
|
||||
setinfo(dmat, "feature_type", feature_types)
|
||||
}
|
||||
if (!is.null(group)) {
|
||||
setinfo(dmat, "group", group)
|
||||
}
|
||||
@@ -195,10 +300,515 @@ xgb.DMatrix <- function(
|
||||
if (!is.null(feature_weights)) {
|
||||
setinfo(dmat, "feature_weights", feature_weights)
|
||||
}
|
||||
if (!is.null(ctypes)) {
|
||||
setinfo(dmat, "feature_type", ctypes)
|
||||
}
|
||||
|
||||
#' @param ref The training dataset that provides quantile information, needed when creating
|
||||
#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix
|
||||
#' as a reference means that the same quantisation applied to the training data is
|
||||
#' applied to the validation/test data
|
||||
#' @param max_bin The number of histogram bins, should be consistent with the training parameter
|
||||
#' `max_bin`.
|
||||
#'
|
||||
#' This is only supported when constructing a QuantileDMatrix.
|
||||
#' @export
|
||||
#' @rdname xgb.DMatrix
|
||||
xgb.QuantileDMatrix <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  missing = NA,
  feature_names = colnames(data),
  feature_types = NULL,
  nthread = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
  enable_categorical = FALSE,
  ref = NULL,
  max_bin = NULL
) {
  nthread <- as.integer(NVL(nthread, -1L))
  ref_is_acceptable <- is.null(ref) || inherits(ref, "xgb.DMatrix")
  if (!ref_is_acceptable) {
    stop("'ref' must be an xgb.DMatrix object.")
  }

  # Integer matrices are passed along without being cast to numeric. Machine
  # 'int' values have no missing value of their own, so R encodes integer NA
  # as the minimum integer. Because 'missing' is fixed before the data is
  # supplied, an NA 'missing' has to be translated by hand for such inputs.
  is_int_like_matrix <- is.matrix(data) && storage.mode(data) %in% c("integer", "logical")
  if (is_int_like_matrix && is.na(missing)) {
    missing <- .Call(XGGetRNAIntAsDouble)
  }

  # All inputs are stored in one environment, which is handed to a data
  # iterator that serves them as a single batch.
  single_batch_env <- as.environment(
    list(
      data = data,
      label = label,
      weight = weight,
      base_margin = base_margin,
      missing = missing,
      feature_names = feature_names,
      feature_types = feature_types,
      group = group,
      qid = qid,
      label_lower_bound = label_lower_bound,
      label_upper_bound = label_upper_bound,
      feature_weights = feature_weights,
      enable_categorical = enable_categorical
    )
  )
  single_batch_iter <- .single.data.iterator(single_batch_env)

  # The ProxyDMatrix carries a finalizer in its R externalptr object, but that
  # only runs at garbage collection, which does not happen as soon as the
  # object goes out of scope. Freeing it on exit triggers its destruction
  # earlier so that the memory is released right away.
  proxy_handle <- .make.proxy.handle()
  on.exit(.Call(XGDMatrixFree_R, proxy_handle))

  # Zero-argument callbacks handed to the native constructor.
  iterator_next <- function() {
    xgb.ProxyDMatrix.internal(proxy_handle, single_batch_iter)
  }
  iterator_reset <- function() {
    single_batch_iter$f_reset(single_batch_env)
  }
  calling_env <- environment()

  dmat <- .Call(
    XGQuantileDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    max_bin,
    ref
  )
  # The resulting object takes over the 'fields' environment of the proxy.
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  dmat
}
|
||||
|
||||
#' @title XGBoost Data Iterator
|
||||
#' @description Interface to create a custom data iterator in order to construct a DMatrix
|
||||
#' from external memory.
|
||||
#'
|
||||
#' This function is responsible for generating an R object structure containing callback
|
||||
#' functions and an environment shared with them.
|
||||
#'
|
||||
#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
|
||||
#' which will consume the data and create a DMatrix from it by executing the callback functions.
|
||||
#'
|
||||
#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||
#' @param env An R environment to pass to the callback functions supplied here, which can be
|
||||
#' used to keep track of variables to determine how to handle the batches.
|
||||
#'
|
||||
#' For example, one might want to keep track of an iteration number in this environment in order
|
||||
#' to know which part of the data to pass next.
|
||||
#' @param f_next `function(env)` which is responsible for:\itemize{
|
||||
#' \item Accessing or retrieving the next batch of data in the iterator.
|
||||
#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
|
||||
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
|
||||
#' be done by modifying variables in the `env` variable that is passed here.
|
||||
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
|
||||
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
|
||||
#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
|
||||
#' }
|
||||
#' @param f_reset `function(env)` which is responsible for resetting the data iterator
|
||||
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
|
||||
#' has been consumed).
|
||||
#'
|
||||
#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
|
||||
#' (and in the same order) must be passed in subsequent iterations.
|
||||
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
|
||||
#' be passed to \link{xgb.ExternalDMatrix}.
|
||||
#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
|
||||
#' @export
|
||||
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  # Both callbacks are mandatory and must be functions.
  if (!is.function(f_next)) {
    stop("'f_next' must be a function.")
  }
  if (!is.function(f_reset)) {
    stop("'f_reset' must be a function.")
  }
  # The iterator is a plain list of the inputs, tagged with its S3 class.
  iterator_parts <- list(
    env = env,
    f_next = f_next,
    f_reset = f_reset
  )
  structure(iterator_parts, class = "xgb.DataIter")
}
|
||||
|
||||
# 'f_next' callback of the single-batch iterator.
#
# Returns NULL once the batch counter stored under 'iter' in 'env' has
# reached 1. Otherwise returns an 'xgb.ProxyDMatrix' built from the fields
# stored in 'env', and advances the counter when the function exits.
.qdm.single.fnext <- function(env) {
  batches_served <- env[["iter"]]
  if (batches_served >= 1L) {
    return(NULL)
  }

  # The counter is advanced on exit, i.e. after the batch has been built.
  on.exit(env[["iter"]] <- batches_served + 1L)
  xgb.ProxyDMatrix(
    data = env[["data"]],
    label = env[["label"]],
    weight = env[["weight"]],
    base_margin = env[["base_margin"]],
    feature_names = env[["feature_names"]],
    feature_types = env[["feature_types"]],
    group = env[["group"]],
    qid = env[["qid"]],
    label_lower_bound = env[["label_lower_bound"]],
    label_upper_bound = env[["label_upper_bound"]],
    feature_weights = env[["feature_weights"]],
    enable_categorical = env[["enable_categorical"]]
  )
}
|
||||
|
||||
# 'f_reset' callback of the single-batch iterator: sets the batch counter
# stored under 'iter' in 'env' back to zero. Returns NULL invisibly.
.qdm.single.freset <- function(env) {
  assign("iter", 0L, envir = env)
  invisible(NULL)
}
|
||||
|
||||
# Build an 'xgb.DataIter' that serves the contents of 'env' as one single
# batch. The batch counter stored under 'iter' in 'env' is initialised to zero.
.single.data.iterator <- function(env) {
  assign("iter", 0L, envir = env)
  xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset)
}
|
||||
|
||||
# Only for internal usage
|
||||
# Create a new proxy DMatrix handle through the native library and tag it
# with its S3 classes and a fresh 'fields' environment.
.make.proxy.handle <- function() {
  handle <- .Call(XGProxyDMatrixCreate_R)
  handle_attributes <- list(
    class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"),
    fields = new.env()
  )
  attributes(handle) <- handle_attributes
  handle
}
|
||||
|
||||
#' @title Proxy DMatrix Updater
|
||||
#' @description Helper function to supply data in batches of a data iterator when
|
||||
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
|
||||
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
|
||||
#'
|
||||
#' This function is \bold{only} meant to be called inside of a callback function (which
|
||||
#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
|
||||
#' when constructing a DMatrix through external memory - otherwise, one should call
|
||||
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
|
||||
#'
|
||||
#' The object that results from calling this function directly is \bold{not} like the other
|
||||
#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only
|
||||
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
|
||||
#'
|
||||
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||
#' @inheritParams xgb.DMatrix
|
||||
#' @param data Batch of data belonging to this batch.
|
||||
#'
|
||||
#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible
|
||||
#' to pass here. Supported types are:\itemize{
|
||||
#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
|
||||
#' `integer` and `logical`, missing values might not be automatically recognized as
|
||||
#' such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix}
|
||||
#' for details on this.
|
||||
#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
|
||||
#' conversions applied to it. See the documentation for parameter `data` in
|
||||
#' \link{xgb.DMatrix} for details on it.
|
||||
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
|
||||
#' }
|
||||
#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the
|
||||
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
|
||||
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
|
||||
#' @export
|
||||
xgb.ProxyDMatrix <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  feature_names = colnames(data),
  feature_types = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL,
  enable_categorical = FALSE
) {
  # Only these three input classes are accepted for a batch.
  stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
  # The result is just the inputs bundled in a list and tagged with a class;
  # nothing is passed to the native library at this point.
  batch <- list(
    data = data,
    label = label,
    weight = weight,
    base_margin = base_margin,
    feature_names = feature_names,
    feature_types = feature_types,
    group = group,
    qid = qid,
    label_lower_bound = label_lower_bound,
    label_upper_bound = label_upper_bound,
    feature_weights = feature_weights,
    enable_categorical = enable_categorical
  )
  structure(batch, class = "xgb.ProxyDMatrix")
}
|
||||
|
||||
# Fetch the next batch from a data iterator and load it into a proxy handle.
#
# Calls the iterator's 'f_next' with the iterator's environment. Returns 0L
# when 'f_next' returns NULL (no more batches). Otherwise the batch data and
# its fields are set on 'proxy_handle' and 1L is returned.
xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
  batch <- data_iterator$f_next(data_iterator$env)
  if (is.null(batch)) {
    return(0L)
  }
  if (!inherits(batch, "xgb.ProxyDMatrix")) {
    stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.")
  }

  has_group <- !is.null(batch$group)
  if (has_group && !is.null(batch$qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }

  # Hand the data to the proxy through the setter matching its class.
  if (is.data.frame(batch$data)) {
    processed <- .process.df.for.dmatrix(batch$data, batch$enable_categorical, batch$feature_types)
    # For data frames, the feature types are the ones returned by the helper.
    batch$feature_types <- processed$feature_types
    .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, processed$lst)
    rm(processed)
  } else if (is.matrix(batch$data)) {
    .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, batch$data)
  } else if (inherits(batch$data, "dgRMatrix")) {
    csr_parts <- list(
      p = batch$data@p,
      j = batch$data@j,
      x = batch$data@x,
      ncol = ncol(batch$data)
    )
    .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, csr_parts)
  } else {
    stop("'data' has unsupported type.")
  }

  .set.dmatrix.fields(
    dmat = proxy_handle,
    label = batch$label,
    weight = batch$weight,
    base_margin = batch$base_margin,
    feature_names = batch$feature_names,
    feature_types = batch$feature_types,
    group = batch$group,
    qid = batch$qid,
    label_lower_bound = batch$label_lower_bound,
    label_upper_bound = batch$label_upper_bound,
    feature_weights = batch$feature_weights
  )

  1L
}
|
||||
|
||||
#' @title DMatrix from External Data
|
||||
#' @description Create a special type of xgboost 'DMatrix' object from external data
|
||||
#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
|
||||
#' bigger set that might not fit entirely in memory.
|
||||
#'
|
||||
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
|
||||
#' without being concatenated, but note that fields like 'label' \bold{will} be
|
||||
#' concatenated from multiple calls to the data iterator.
|
||||
#'
|
||||
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
||||
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||
#' @inheritParams xgb.DMatrix
|
||||
#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter},
|
||||
#' which includes an environment shared between function calls, and functions to access
|
||||
#' the data in batches on-demand.
|
||||
#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
|
||||
#' @param missing A float value to represent missing values in data.
|
||||
#'
|
||||
#' Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it
|
||||
#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
|
||||
#' it will not be adapted for different input types.
|
||||
#'
|
||||
#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
|
||||
#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
|
||||
#' which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||
#' 'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||
#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
|
||||
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
|
||||
#' held internally but accessed through the iterator when needed.
|
||||
#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
|
||||
#' @examples
|
||||
#' library(xgboost)
|
||||
#' data(mtcars)
|
||||
#'
|
||||
#' # this custom environment will be passed to the iterator
|
||||
#' # functions at each call. It's up to the user to keep
|
||||
#' # track of the iteration number in this environment.
|
||||
#' iterator_env <- as.environment(
|
||||
#' list(
|
||||
#' iter = 0,
|
||||
#' x = mtcars[, -1],
|
||||
#' y = mtcars[, 1]
|
||||
#' )
|
||||
#' )
|
||||
#'
|
||||
#' # Data is passed in two batches.
|
||||
#' # In this example, batches are obtained by subsetting the 'x' variable.
|
||||
#' # This is not advantageous to do, since the data is already loaded in memory
|
||||
#' # and can be passed in full in one go, but there can be situations in which
|
||||
#' # only a subset of the data will fit in the computer's memory, and it can
|
||||
#' # be loaded in batches that are accessed one-at-a-time only.
|
||||
#' iterator_next <- function(iterator_env) {
|
||||
#' curr_iter <- iterator_env[["iter"]]
|
||||
#' if (curr_iter >= 2) {
|
||||
#' # there are only two batches, so this signals end of the stream
|
||||
#' return(NULL)
|
||||
#' }
|
||||
#'
|
||||
#' if (curr_iter == 0) {
|
||||
#' x_batch <- iterator_env[["x"]][1:16, ]
|
||||
#' y_batch <- iterator_env[["y"]][1:16]
|
||||
#' } else {
|
||||
#' x_batch <- iterator_env[["x"]][17:32, ]
|
||||
#' y_batch <- iterator_env[["y"]][17:32]
|
||||
#' }
|
||||
#' on.exit({
|
||||
#' iterator_env[["iter"]] <- curr_iter + 1
|
||||
#' })
|
||||
#'
|
||||
#' # Function 'xgb.ProxyDMatrix' must be called manually
|
||||
#' # at each batch with all the appropriate attributes,
|
||||
#' # such as feature names and feature types.
|
||||
#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
|
||||
#' }
|
||||
#'
|
||||
#' # This moves the iterator back to its beginning
|
||||
#' iterator_reset <- function(iterator_env) {
|
||||
#' iterator_env[["iter"]] <- 0
|
||||
#' }
|
||||
#'
|
||||
#' data_iterator <- xgb.DataIter(
|
||||
#' env = iterator_env,
|
||||
#' f_next = iterator_next,
|
||||
#' f_reset = iterator_reset
|
||||
#' )
|
||||
#' cache_prefix <- tempdir()
|
||||
#'
|
||||
#' # DMatrix will be constructed from the iterator's batches
|
||||
#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
|
||||
#'
|
||||
#' # After construction, can be used as a regular DMatrix
|
||||
#' params <- list(nthread = 1, objective = "reg:squarederror")
|
||||
#' model <- xgb.train(data = dm, nrounds = 2, params = params)
|
||||
#'
|
||||
#' # Predictions can also be called on it, and should be the same
|
||||
#' # as if the data were passed differently.
|
||||
#' pred_dm <- predict(model, dm)
|
||||
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
|
||||
#' @export
|
||||
xgb.ExternalDMatrix <- function(
  data_iterator,
  cache_prefix = tempdir(),
  missing = NA,
  nthread = NULL
) {
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  stopifnot(is.character(cache_prefix))

  # Normalise the inputs before handing them to the native constructor.
  cache_prefix <- path.expand(cache_prefix)
  nthread <- as.integer(NVL(nthread, -1L))

  # The proxy handle is freed explicitly when this function exits.
  proxy_handle <- .make.proxy.handle()
  on.exit(.Call(XGDMatrixFree_R, proxy_handle))

  # Zero-argument callbacks wrapping the user-supplied iterator.
  iterator_next <- function() {
    xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)
  }
  iterator_reset <- function() {
    data_iterator$f_reset(data_iterator$env)
  }
  calling_env <- environment()

  dmat <- .Call(
    XGDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    cache_prefix
  )

  # The resulting object takes over the 'fields' environment of the proxy.
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  dmat
}
|
||||
|
||||
|
||||
#' @title QuantileDMatrix from External Data
|
||||
#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
|
||||
#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
|
||||
#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
|
||||
#' a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
|
||||
#'
|
||||
#' Note that, while external data will only be loaded through the iterator (thus the full data
|
||||
#' might not be held entirely in-memory), the quantized representation of the data will get
|
||||
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
|
||||
#' version is typically lighter than the original data, so there might be cases in which this
|
||||
#' representation could potentially fit in memory even if the full data doesn't.
|
||||
#'
|
||||
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
||||
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||
#' @inheritParams xgb.ExternalDMatrix
|
||||
#' @inheritParams xgb.QuantileDMatrix
|
||||
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
|
||||
#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
|
||||
#' \link{xgb.QuantileDMatrix}
|
||||
#' @export
|
||||
xgb.QuantileDMatrix.from_iterator <- function( # nolint
  data_iterator,
  missing = NA,
  nthread = NULL,
  ref = NULL,
  max_bin = NULL
) {
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  ref_is_acceptable <- is.null(ref) || inherits(ref, "xgb.DMatrix")
  if (!ref_is_acceptable) {
    stop("'ref' must be an xgb.DMatrix object.")
  }

  nthread <- as.integer(NVL(nthread, -1L))

  # The proxy handle is freed explicitly when this function exits.
  proxy_handle <- .make.proxy.handle()
  on.exit(.Call(XGDMatrixFree_R, proxy_handle))

  # Zero-argument callbacks wrapping the user-supplied iterator.
  iterator_next <- function() {
    xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)
  }
  iterator_reset <- function() {
    data_iterator$f_reset(data_iterator$env)
  }
  calling_env <- environment()

  dmat <- .Call(
    XGQuantileDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    max_bin,
    ref
  )

  # The resulting object takes over the 'fields' environment of the proxy.
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  dmat
}
|
||||
|
||||
@@ -712,7 +1322,17 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
|
||||
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
|
||||
return(invisible(x))
|
||||
}
|
||||
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
|
||||
class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
|
||||
"xgb.QuantileDMatrix"
|
||||
} else if (inherits(x, "xgb.ExternalDMatrix")) {
|
||||
"xgb.ExternalDMatrix"
|
||||
} else if (inherits(x, "xgb.ProxyDMatrix")) {
|
||||
"xgb.ProxyDMatrix"
|
||||
} else {
|
||||
"xgb.DMatrix"
|
||||
}
|
||||
|
||||
cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
|
||||
infos <- character(0)
|
||||
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
|
||||
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
|
||||
|
||||
Reference in New Issue
Block a user