[R] rename proxy dmatrix -> data batch (#10016)

This commit is contained in:
david-cortes 2024-01-31 08:43:58 +01:00 committed by GitHub
parent 1e72dc1276
commit 0955213220
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 43 additions and 42 deletions

View File

@ -32,9 +32,9 @@ export(setinfo)
export(xgb.DMatrix) export(xgb.DMatrix)
export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save) export(xgb.DMatrix.save)
export(xgb.DataBatch)
export(xgb.DataIter) export(xgb.DataIter)
export(xgb.ExternalDMatrix) export(xgb.ExternalDMatrix)
export(xgb.ProxyDMatrix)
export(xgb.QuantileDMatrix) export(xgb.QuantileDMatrix)
export(xgb.QuantileDMatrix.from_iterator) export(xgb.QuantileDMatrix.from_iterator)
export(xgb.attr) export(xgb.attr)

View File

@ -348,7 +348,7 @@ xgb.QuantileDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle) .Call(XGDMatrixFree_R, proxy_handle)
}) })
iterator_next <- function() { iterator_next <- function() {
return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
} }
iterator_reset <- function() { iterator_reset <- function() {
return(data_iterator$f_reset(iterator_env)) return(data_iterator$f_reset(iterator_env))
@ -391,12 +391,12 @@ xgb.QuantileDMatrix <- function(
#' to know which part of the data to pass next. #' to know which part of the data to pass next.
#' @param f_next `function(env)` which is responsible for:\itemize{ #' @param f_next `function(env)` which is responsible for:\itemize{
#' \item Accessing or retrieving the next batch of data in the iterator. #' \item Accessing or retrieving the next batch of data in the iterator.
#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. #' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example #' \item Keeping track of where in the iterator batch it is or will go next, which can for example
#' be done by modifying variables in the `env` variable that is passed here. #' be done by modifying variables in the `env` variable that is passed here.
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL` #' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from #' when the stream of data ends (all batches in the iterator have been consumed), or the result from
#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. #' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
#' } #' }
#' @param f_reset `function(env)` which is responsible for resetting the data iterator #' @param f_reset `function(env)` which is responsible for resetting the data iterator
#' (i.e. taking it back to the first batch, called before and after the sequence of batches #' (i.e. taking it back to the first batch, called before and after the sequence of batches
@ -406,7 +406,7 @@ xgb.QuantileDMatrix <- function(
#' (and in the same order) must be passed in subsequent iterations. #' (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then #' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to \link{xgb.ExternalDMatrix}. #' be passed to \link{xgb.ExternalDMatrix}.
#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. #' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
#' @export #' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) { xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
if (!is.function(f_next)) { if (!is.function(f_next)) {
@ -434,7 +434,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
env[["iter"]] <- curr_iter + 1L env[["iter"]] <- curr_iter + 1L
}) })
return( return(
xgb.ProxyDMatrix( xgb.DataBatch(
data = env[["data"]], data = env[["data"]],
label = env[["label"]], label = env[["label"]],
weight = env[["weight"]], weight = env[["weight"]],
@ -464,13 +464,13 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
.make.proxy.handle <- function() { .make.proxy.handle <- function() {
out <- .Call(XGProxyDMatrixCreate_R) out <- .Call(XGProxyDMatrixCreate_R)
attributes(out) <- list( attributes(out) <- list(
class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"), class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
fields = new.env() fields = new.env()
) )
return(out) return(out)
} }
#' @title Proxy DMatrix Updater #' @title Structure for Data Batches
#' @description Helper function to supply data in batches of a data iterator when #' @description Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix} #' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
#' or through \link{xgb.QuantileDMatrix.from_iterator}. #' or through \link{xgb.QuantileDMatrix.from_iterator}.
@ -480,8 +480,8 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' when constructing a DMatrix through external memory - otherwise, one should call #' when constructing a DMatrix through external memory - otherwise, one should call
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. #' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
#' #'
#' The object that results from calling this function directly is \bold{not} like the other #' The object that results from calling this function directly is \bold{not} like
#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only #' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed. #' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#' #'
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. #' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
@ -499,11 +499,11 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' \link{xgb.DMatrix} for details on it. #' \link{xgb.DMatrix} for details on it.
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. #' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
#' } #' }
#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the #' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`. #' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}. #' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
#' @export #' @export
xgb.ProxyDMatrix <- function( xgb.DataBatch <- function(
data, data,
label = NULL, label = NULL,
weight = NULL, weight = NULL,
@ -530,17 +530,18 @@ xgb.ProxyDMatrix <- function(
label_upper_bound = label_upper_bound, label_upper_bound = label_upper_bound,
feature_weights = feature_weights feature_weights = feature_weights
) )
class(out) <- "xgb.ProxyDMatrix" class(out) <- "xgb.DataBatch"
return(out) return(out)
} }
xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) { # This is only for internal usage, class is not exposed to the user.
xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
lst <- data_iterator$f_next(data_iterator$env) lst <- data_iterator$f_next(data_iterator$env)
if (is.null(lst)) { if (is.null(lst)) {
return(0L) return(0L)
} }
if (!inherits(lst, "xgb.ProxyDMatrix")) { if (!inherits(lst, "xgb.DataBatch")) {
stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.") stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
} }
if (!is.null(lst$group) && !is.null(lst$qid)) { if (!is.null(lst$group) && !is.null(lst$qid)) {
@ -606,7 +607,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value. #' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not #' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#' held internally but accessed through the iterator when needed. #' held internally but accessed through the iterator when needed.
#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} #' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
#' @examples #' @examples
#' library(xgboost) #' library(xgboost)
#' data(mtcars) #' data(mtcars)
@ -646,10 +647,10 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) {
#' iterator_env[["iter"]] <- curr_iter + 1 #' iterator_env[["iter"]] <- curr_iter + 1
#' }) #' })
#' #'
#' # Function 'xgb.ProxyDMatrix' must be called manually #' # Function 'xgb.DataBatch' must be called manually
#' # at each batch with all the appropriate attributes, #' # at each batch with all the appropriate attributes,
#' # such as feature names and feature types. #' # such as feature names and feature types.
#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) #' return(xgb.DataBatch(data = x_batch, label = y_batch))
#' } #' }
#' #'
#' # This moves the iterator back to its beginning #' # This moves the iterator back to its beginning
@ -693,7 +694,7 @@ xgb.ExternalDMatrix <- function(
.Call(XGDMatrixFree_R, proxy_handle) .Call(XGDMatrixFree_R, proxy_handle)
}) })
iterator_next <- function() { iterator_next <- function() {
return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
} }
iterator_reset <- function() { iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env)) return(data_iterator$f_reset(data_iterator$env))
@ -736,7 +737,7 @@ xgb.ExternalDMatrix <- function(
#' @inheritParams xgb.ExternalDMatrix #' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.QuantileDMatrix #' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'. #' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, #' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
#' \link{xgb.QuantileDMatrix} #' \link{xgb.QuantileDMatrix}
#' @export #' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint xgb.QuantileDMatrix.from_iterator <- function( # nolint
@ -758,7 +759,7 @@ xgb.QuantileDMatrix.from_iterator <- function( # nolint
.Call(XGDMatrixFree_R, proxy_handle) .Call(XGDMatrixFree_R, proxy_handle)
}) })
iterator_next <- function() { iterator_next <- function() {
return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
} }
iterator_reset <- function() { iterator_reset <- function() {
return(data_iterator$f_reset(data_iterator$env)) return(data_iterator$f_reset(data_iterator$env))

View File

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R % Please edit documentation in R/xgb.DMatrix.R
\name{xgb.ProxyDMatrix} \name{xgb.DataBatch}
\alias{xgb.ProxyDMatrix} \alias{xgb.DataBatch}
\title{Proxy DMatrix Updater} \title{Structure for Data Batches}
\usage{ \usage{
xgb.ProxyDMatrix( xgb.DataBatch(
data, data,
label = NULL, label = NULL,
weight = NULL, weight = NULL,
@ -82,7 +82,7 @@ functionalities such as feature importances.}
\item{feature_weights}{Set feature weights for column sampling.} \item{feature_weights}{Set feature weights for column sampling.}
} }
\value{ \value{
An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the An object of class \code{xgb.DataBatch}, which is just a list containing the
data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}. data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
} }
\description{ \description{
@ -95,8 +95,8 @@ is passed as argument to function \link{xgb.DataIter} to construct a data iterat
when constructing a DMatrix through external memory - otherwise, one should call when constructing a DMatrix through external memory - otherwise, one should call
\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
The object that results from calling this function directly is \bold{not} like the other The object that results from calling this function directly is \bold{not} like
\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only
possible usage is to supply data to an iterator, from which a DMatrix is then constructed. possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.

View File

@ -15,12 +15,12 @@ to know which part of the data to pass next.}
\item{f_next}{\verb{function(env)} which is responsible for:\itemize{ \item{f_next}{\verb{function(env)} which is responsible for:\itemize{
\item Accessing or retrieving the next batch of data in the iterator. \item Accessing or retrieving the next batch of data in the iterator.
\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
\item Keeping track of where in the iterator batch it is or will go next, which can for example \item Keeping track of where in the iterator batch it is or will go next, which can for example
be done by modifying variables in the \code{env} variable that is passed here. be done by modifying variables in the \code{env} variable that is passed here.
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL} \item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
when the stream of data ends (all batches in the iterator have been consumed), or the result from when the stream of data ends (all batches in the iterator have been consumed), or the result from
calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
}} }}
\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator \item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
@ -47,5 +47,5 @@ which will consume the data and create a DMatrix from it by executing the callba
For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}. For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
} }
\seealso{ \seealso{
\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
} }

View File

@ -87,10 +87,10 @@ iterator_next <- function(iterator_env) {
iterator_env[["iter"]] <- curr_iter + 1 iterator_env[["iter"]] <- curr_iter + 1
}) })
# Function 'xgb.ProxyDMatrix' must be called manually # Function 'xgb.DataBatch' must be called manually
# at each batch with all the appropriate attributes, # at each batch with all the appropriate attributes,
# such as feature names and feature types. # such as feature names and feature types.
return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) return(xgb.DataBatch(data = x_batch, label = y_batch))
} }
# This moves the iterator back to its beginning # This moves the iterator back to its beginning
@ -118,5 +118,5 @@ pred_dm <- predict(model, dm)
pred_mat <- predict(model, as.matrix(mtcars[, -1])) pred_mat <- predict(model, as.matrix(mtcars[, -1]))
} }
\seealso{ \seealso{
\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
} }

View File

@ -60,6 +60,6 @@ For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
} }
\seealso{ \seealso{
\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
\link{xgb.QuantileDMatrix} \link{xgb.QuantileDMatrix}
} }

View File

@ -472,7 +472,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
y = mtcars[, 1] y = mtcars[, 1]
) )
) )
iterator_next <- function(iterator_env, proxy_handle) { iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]] curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) { if (curr_iter >= 2) {
return(NULL) return(NULL)
@ -487,7 +487,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
on.exit({ on.exit({
iterator_env[["iter"]] <- curr_iter + 1 iterator_env[["iter"]] <- curr_iter + 1
}) })
return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) return(xgb.DataBatch(data = x_batch, label = y_batch))
} }
iterator_reset <- function(iterator_env) { iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0 iterator_env[["iter"]] <- 0
@ -546,7 +546,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
y = mtcars[, 1] y = mtcars[, 1]
) )
) )
iterator_next <- function(iterator_env, proxy_handle) { iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]] curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) { if (curr_iter >= 2) {
return(NULL) return(NULL)
@ -561,7 +561,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
on.exit({ on.exit({
iterator_env[["iter"]] <- curr_iter + 1 iterator_env[["iter"]] <- curr_iter + 1
}) })
return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) return(xgb.DataBatch(data = x_batch, label = y_batch))
} }
iterator_reset <- function(iterator_env) { iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0 iterator_env[["iter"]] <- 0
@ -604,7 +604,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
y = mtcars[, 1] y = mtcars[, 1]
) )
) )
iterator_next <- function(iterator_env, proxy_handle) { iterator_next <- function(iterator_env) {
curr_iter <- iterator_env[["iter"]] curr_iter <- iterator_env[["iter"]]
if (curr_iter >= 2) { if (curr_iter >= 2) {
return(0) return(0)
@ -618,7 +618,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
on.exit({ on.exit({
iterator_env[["iter"]] <- curr_iter + 1 iterator_env[["iter"]] <- curr_iter + 1
}) })
return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) return(xgb.DataBatch(data = x_batch, label = y_batch))
} }
iterator_reset <- function(iterator_env) { iterator_reset <- function(iterator_env) {
iterator_env[["iter"]] <- 0 iterator_env[["iter"]] <- 0