From 09552132209541d8f373f003ebdd4b734cb2f84d Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 31 Jan 2024 08:43:58 +0100 Subject: [PATCH] [R] rename proxy dmatrix -> data batch (#10016) --- R-package/NAMESPACE | 2 +- R-package/R/xgb.DMatrix.R | 43 ++++++++++--------- .../{xgb.ProxyDMatrix.Rd => xgb.DataBatch.Rd} | 14 +++--- R-package/man/xgb.DataIter.Rd | 6 +-- R-package/man/xgb.ExternalDMatrix.Rd | 6 +-- .../man/xgb.QuantileDMatrix.from_iterator.Rd | 2 +- R-package/tests/testthat/test_dmatrix.R | 12 +++--- 7 files changed, 43 insertions(+), 42 deletions(-) rename R-package/man/{xgb.ProxyDMatrix.Rd => xgb.DataBatch.Rd} (93%) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index bb5959eab..580d1f873 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -32,9 +32,9 @@ export(setinfo) export(xgb.DMatrix) export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.save) +export(xgb.DataBatch) export(xgb.DataIter) export(xgb.ExternalDMatrix) -export(xgb.ProxyDMatrix) export(xgb.QuantileDMatrix) export(xgb.QuantileDMatrix.from_iterator) export(xgb.attr) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index ad446d248..a4c476dbc 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -348,7 +348,7 @@ xgb.QuantileDMatrix <- function( .Call(XGDMatrixFree_R, proxy_handle) }) iterator_next <- function() { - return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + return(xgb.ProxyDMatrix(proxy_handle, data_iterator)) } iterator_reset <- function() { return(data_iterator$f_reset(iterator_env)) @@ -391,12 +391,12 @@ xgb.QuantileDMatrix <- function( #' to know which part of the data to pass next. #' @param f_next `function(env)` which is responsible for:\itemize{ #' \item Accessing or retrieving the next batch of data in the iterator. -#' \item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. +#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result. #' \item Keeping track of where in the iterator batch it is or will go next, which can for example #' be done by modifiying variables in the `env` variable that is passed here. #' \item Signaling whether there are more batches to be consumed or not, by returning `NULL` #' when the stream of data ends (all batches in the iterator have been consumed), or the result from -#' calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. +#' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed. #' } #' @param f_reset `function(env)` which is responsible for reseting the data iterator #' (i.e. taking it back to the first batch, called before and after the sequence of batches @@ -406,7 +406,7 @@ xgb.QuantileDMatrix <- function( #' (and in the same order) must be passed in subsequent iterations. #' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then #' be passed to \link{xgb.ExternalDMatrix}. -#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. +#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}. #' @export xgb.DataIter <- function(env = new.env(), f_next, f_reset) { if (!is.function(f_next)) { @@ -434,7 +434,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) { env[["iter"]] <- curr_iter + 1L }) return( - xgb.ProxyDMatrix( + xgb.DataBatch( data = env[["data"]], label = env[["label"]], weight = env[["weight"]], @@ -464,13 +464,13 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) { .make.proxy.handle <- function() { out <- .Call(XGProxyDMatrixCreate_R) attributes(out) <- list( - class = c("xgb.DMatrix", "xgb.ProxyDMatrixHandle"), + class = c("xgb.DMatrix", "xgb.ProxyDMatrix"), fields = new.env() ) return(out) } -#' @title Proxy DMatrix Updater +#' @title Structure for Data Batches #' @description Helper function to supply data in batches of a data iterator when #' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix} #' or through \link{xgb.QuantileDMatrix.from_iterator}. @@ -480,8 +480,8 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) { #' when constructing a DMatrix through external memory - otherwise, one should call #' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. #' -#' The object that results from calling this function directly is \bold{not} like the other -#' `xgb.DMatrix` variants - i.e. cannot be used to train a model, nor to get predictions - only +#' The object that results from calling this function directly is \bold{not} like +#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only #' possible usage is to supply data to an iterator, from which a DMatrix is then constructed. #' #' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. @@ -499,11 +499,11 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) { #' \link{xgb.DMatrix} for details on it. #' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. #' } -#' @return An object of class `xgb.ProxyDMatrix`, which is just a list containing the +#' @return An object of class `xgb.DataBatch`, which is just a list containing the #' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`. #' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}. #' @export -xgb.ProxyDMatrix <- function( +xgb.DataBatch <- function( data, label = NULL, weight = NULL, @@ -530,17 +530,18 @@ xgb.ProxyDMatrix <- function( label_upper_bound = label_upper_bound, feature_weights = feature_weights ) - class(out) <- "xgb.ProxyDMatrix" + class(out) <- "xgb.DataBatch" return(out) } -xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) { +# This is only for internal usage, class is not exposed to the user. +xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) { lst <- data_iterator$f_next(data_iterator$env) if (is.null(lst)) { return(0L) } - if (!inherits(lst, "xgb.ProxyDMatrix")) { - stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.ProxyDMatrix'.") + if (!inherits(lst, "xgb.DataBatch")) { + stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.") } if (!is.null(lst$group) && !is.null(lst$qid)) { @@ -606,7 +607,7 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) { #' This should not pose any problem for `numeric` types, since they do have an inheret NaN value. #' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not #' held internally but accessed through the iterator when needed. -#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} +#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator} #' @examples #' library(xgboost) #' data(mtcars) @@ -646,10 +647,10 @@ xgb.ProxyDMatrix.internal <- function(proxy_handle, data_iterator) { #' iterator_env[["iter"]] <- curr_iter + 1 #' }) #' -#' # Function 'xgb.ProxyDMatrix' must be called manually +#' # Function 'xgb.DataBatch' must be called manually #' # at each batch with all the appropriate attributes, #' # such as feature names and feature types. -#' return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) +#' return(xgb.DataBatch(data = x_batch, label = y_batch)) #' } #' #' # This moves the iterator back to its beginning @@ -693,7 +694,7 @@ xgb.ExternalDMatrix <- function( .Call(XGDMatrixFree_R, proxy_handle) }) iterator_next <- function() { - return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + return(xgb.ProxyDMatrix(proxy_handle, data_iterator)) } iterator_reset <- function() { return(data_iterator$f_reset(data_iterator$env)) @@ -736,7 +737,7 @@ xgb.ExternalDMatrix <- function( #' @inheritParams xgb.ExternalDMatrix #' @inheritParams xgb.QuantileDMatrix #' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'. -#' @seealso \link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, +#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix}, #' \link{xgb.QuantileDMatrix} #' @export xgb.QuantileDMatrix.from_iterator <- function( # nolint @@ -758,7 +759,7 @@ xgb.QuantileDMatrix.from_iterator <- function( # nolint .Call(XGDMatrixFree_R, proxy_handle) }) iterator_next <- function() { - return(xgb.ProxyDMatrix.internal(proxy_handle, data_iterator)) + return(xgb.ProxyDMatrix(proxy_handle, data_iterator)) } iterator_reset <- function() { return(data_iterator$f_reset(data_iterator$env)) diff --git a/R-package/man/xgb.ProxyDMatrix.Rd b/R-package/man/xgb.DataBatch.Rd similarity index 93% rename from R-package/man/xgb.ProxyDMatrix.Rd rename to R-package/man/xgb.DataBatch.Rd index cf173a2db..0eeb117e8 100644 --- a/R-package/man/xgb.ProxyDMatrix.Rd +++ b/R-package/man/xgb.DataBatch.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.R -\name{xgb.ProxyDMatrix} -\alias{xgb.ProxyDMatrix} -\title{Proxy DMatrix Updater} +\name{xgb.DataBatch} +\alias{xgb.DataBatch} +\title{Structure for Data Batches} \usage{ -xgb.ProxyDMatrix( +xgb.DataBatch( data, label = NULL, weight = NULL, @@ -82,7 +82,7 @@ functionalities such as feature importances.} \item{feature_weights}{Set feature weights for column sampling.} } \value{ -An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the +An object of class \code{xgb.DataBatch}, which is just a list containing the data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}. } \description{ @@ -95,8 +95,8 @@ is passed as argument to function \link{xgb.DataIter} to construct a data iterat when constructing a DMatrix through external memory - otherwise, one should call \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}. -The object that results from calling this function directly is \bold{not} like the other -\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only +The object that results from calling this function directly is \bold{not} like +an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only possible usage is to supply data to an iterator, from which a DMatrix is then constructed. For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}. diff --git a/R-package/man/xgb.DataIter.Rd b/R-package/man/xgb.DataIter.Rd index 29cf5acc9..2bd68ce51 100644 --- a/R-package/man/xgb.DataIter.Rd +++ b/R-package/man/xgb.DataIter.Rd @@ -15,12 +15,12 @@ to know which part of the data to pass next.} \item{f_next}{\verb{function(env)} which is responsible for:\itemize{ \item Accessing or retrieving the next batch of data in the iterator. -\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result. +\item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result. \item Keeping track of where in the iterator batch it is or will go next, which can for example be done by modifiying variables in the \code{env} variable that is passed here. \item Signaling whether there are more batches to be consumed or not, by returning \code{NULL} when the stream of data ends (all batches in the iterator have been consumed), or the result from -calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed. +calling \link{xgb.DataBatch} when there are more batches in the line to be consumed. }} \item{f_reset}{\verb{function(env)} which is responsible for reseting the data iterator @@ -47,5 +47,5 @@ which will consume the data and create a DMatrix from it by executing the callba For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}. } \seealso{ -\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}. +\link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}. } diff --git a/R-package/man/xgb.ExternalDMatrix.Rd b/R-package/man/xgb.ExternalDMatrix.Rd index 3e7844990..14a872cb5 100644 --- a/R-package/man/xgb.ExternalDMatrix.Rd +++ b/R-package/man/xgb.ExternalDMatrix.Rd @@ -87,10 +87,10 @@ iterator_next <- function(iterator_env) { iterator_env[["iter"]] <- curr_iter + 1 }) - # Function 'xgb.ProxyDMatrix' must be called manually + # Function 'xgb.DataBatch' must be called manually # at each batch with all the appropriate attributes, # such as feature names and feature types. - return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + return(xgb.DataBatch(data = x_batch, label = y_batch)) } # This moves the iterator back to its beginning @@ -118,5 +118,5 @@ pred_dm <- predict(model, dm) pred_mat <- predict(model, as.matrix(mtcars[, -1])) } \seealso{ -\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator} +\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator} } diff --git a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd index 21f24576d..791b5576e 100644 --- a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd +++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd @@ -60,6 +60,6 @@ For more information, see the guide 'Using XGBoost External Memory Version': \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html} } \seealso{ -\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix}, +\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix}, \link{xgb.QuantileDMatrix} } diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 43b7515bb..50621f241 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -472,7 +472,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa y = mtcars[, 1] ) ) - iterator_next <- function(iterator_env, proxy_handle) { + iterator_next <- function(iterator_env) { curr_iter <- iterator_env[["iter"]] if (curr_iter >= 2) { return(NULL) @@ -487,7 +487,7 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa on.exit({ iterator_env[["iter"]] <- curr_iter + 1 }) - return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + return(xgb.DataBatch(data = x_batch, label = y_batch)) } iterator_reset <- function(iterator_env) { iterator_env[["iter"]] <- 0 @@ -546,7 +546,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", { y = mtcars[, 1] ) ) - iterator_next <- function(iterator_env, proxy_handle) { + iterator_next <- function(iterator_env) { curr_iter <- iterator_env[["iter"]] if (curr_iter >= 2) { return(NULL) @@ -561,7 +561,7 @@ test_that("xgb.DMatrix: External QDM produces same results as regular QDM", { on.exit({ iterator_env[["iter"]] <- curr_iter + 1 }) - return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + return(xgb.DataBatch(data = x_batch, label = y_batch)) } iterator_reset <- function(iterator_env) { iterator_env[["iter"]] <- 0 @@ -604,7 +604,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u y = mtcars[, 1] ) ) - iterator_next <- function(iterator_env, proxy_handle) { + iterator_next <- function(iterator_env) { curr_iter <- iterator_env[["iter"]] if (curr_iter >= 2) { return(0) @@ -618,7 +618,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u on.exit({ iterator_env[["iter"]] <- curr_iter + 1 }) - return(xgb.ProxyDMatrix(data = x_batch, label = y_batch)) + return(xgb.DataBatch(data = x_batch, label = y_batch)) } iterator_reset <- function(iterator_env) { iterator_env[["iter"]] <- 0