[R] Rename ExternalDMatrix -> ExtMemDMatrix. (#10849)

This commit is contained in:
Jiaming Yuan 2024-09-29 05:45:53 +08:00 committed by GitHub
parent 9ee4008654
commit c9f89c4241
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 46 additions and 43 deletions

View File

@ -29,7 +29,7 @@ export(xgb.DMatrix.hasinfo)
export(xgb.DMatrix.save)
export(xgb.DataBatch)
export(xgb.DataIter)
export(xgb.ExternalDMatrix)
export(xgb.ExtMemDMatrix)
export(xgb.QuantileDMatrix)
export(xgb.QuantileDMatrix.from_iterator)
export(xgb.attr)

View File

@ -418,10 +418,10 @@ xgb.QuantileDMatrix <- function(
#' This function is responsible for generating an R object structure containing callback
#' functions and an environment shared with them.
#'
#' The output structure from this function is then meant to be passed to [xgb.ExternalDMatrix()],
#' The output structure from this function is then meant to be passed to [xgb.ExtMemDMatrix()],
#' which will consume the data and create a DMatrix from it by executing the callback functions.
#'
#' For more information, and for a usage example, see the documentation for [xgb.ExternalDMatrix()].
#' For more information, and for a usage example, see the documentation for [xgb.ExtMemDMatrix()].
#'
#' @param env An R environment to pass to the callback functions supplied here, which can be
#' used to keep track of variables to determine how to handle the batches.
@ -443,8 +443,8 @@ xgb.QuantileDMatrix <- function(
#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
#' (and in the same order) must be passed in subsequent iterations.
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
#' be passed to [xgb.ExternalDMatrix()].
#' @seealso [xgb.ExternalDMatrix()], [xgb.DataBatch()].
#' be passed to [xgb.ExtMemDMatrix()].
#' @seealso [xgb.ExtMemDMatrix()], [xgb.DataBatch()].
#' @export
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
if (!is.function(f_next)) {
@ -512,7 +512,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#'
#' @description
#' Helper function to supply data in batches of a data iterator when
#' constructing a DMatrix from external memory through [xgb.ExternalDMatrix()]
#' constructing a DMatrix from external memory through [xgb.ExtMemDMatrix()]
#' or through [xgb.QuantileDMatrix.from_iterator()].
#'
#' This function is **only** meant to be called inside of a callback function (which
@ -524,7 +524,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
#'
#' For more information and for example usage, see the documentation for [xgb.ExternalDMatrix()].
#' For more information and for example usage, see the documentation for [xgb.ExtMemDMatrix()].
#' @inheritParams xgb.DMatrix
#' @param data Batch of data belonging to this batch.
#'
@ -532,7 +532,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' to pass here. Supported types are:
#' - `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
#' `integer` and `logical`, missing values might not be automatically recognized as
#' as such - see the documentation for parameter `missing` in [xgb.ExternalDMatrix()]
#' as such - see the documentation for parameter `missing` in [xgb.ExtMemDMatrix()]
#' for details on this.
#' - `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
#' conversions applied to it. See the documentation for parameter `data` in
@ -540,7 +540,7 @@ xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
#' - CSR matrices, as class `dgRMatrix` from package "Matrix".
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
#' data and parameters passed here. It does **not** inherit from `xgb.DMatrix`.
#' @seealso [xgb.DataIter()], [xgb.ExternalDMatrix()].
#' @seealso [xgb.DataIter()], [xgb.ExtMemDMatrix()].
#' @export
xgb.DataBatch <- function(
data,
@ -643,10 +643,10 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
#'
#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
#' which is interpreted as a floating-point NaN by [xgb.ExternalDMatrix()] and by
#' which is interpreted as a floating-point NaN by [xgb.ExtMemDMatrix()] and by
#' [xgb.QuantileDMatrix.from_iterator()], these integer missing values will not be treated as missing.
#' This should not pose any problem for `numeric` types, since they do have an inheret NaN value.
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
#' held internally but accessed through the iterator when needed.
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.QuantileDMatrix.from_iterator()]
#' @examples
@ -706,7 +706,7 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
#' cache_prefix <- tempdir()
#'
#' # DMatrix will be constructed from the iterator's batches
#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
#' dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
#'
#' # After construction, can be used as a regular DMatrix
#' params <- list(nthread = 1, objective = "reg:squarederror")
@ -717,7 +717,7 @@ xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
#' pred_dm <- predict(model, dm)
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
#' @export
xgb.ExternalDMatrix <- function(
xgb.ExtMemDMatrix <- function(
data_iterator,
cache_prefix = tempdir(),
missing = NA,
@ -753,7 +753,7 @@ xgb.ExternalDMatrix <- function(
)
attributes(dmat) <- list(
class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
class = c("xgb.DMatrix", "xgb.ExtMemDMatrix"),
fields = attributes(proxy_handle)$fields
)
return(dmat)
@ -766,7 +766,7 @@ xgb.ExternalDMatrix <- function(
#' Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
#' calling function [xgb.QuantileDMatrix()], with the same advantages and limitations) from
#' external data supplied by [xgb.DataIter()], potentially passed in batches from
#' a bigger set that might not fit entirely in memory, same way as [xgb.ExternalDMatrix()].
#' a bigger set that might not fit entirely in memory, same way as [xgb.ExtMemDMatrix()].
#'
#' Note that, while external data will only be loaded through the iterator (thus the full data
#' might not be held entirely in-memory), the quantized representation of the data will get
@ -776,10 +776,10 @@ xgb.ExternalDMatrix <- function(
#'
#' For more information, see the guide 'Using XGBoost External Memory Version':
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
#' @inheritParams xgb.ExternalDMatrix
#' @inheritParams xgb.ExtMemDMatrix
#' @inheritParams xgb.QuantileDMatrix
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.ExternalDMatrix()],
#' @seealso [xgb.DataIter()], [xgb.DataBatch()], [xgb.ExtMemDMatrix()],
#' [xgb.QuantileDMatrix()]
#' @export
xgb.QuantileDMatrix.from_iterator <- function( # nolint
@ -1318,8 +1318,8 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
}
class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
"xgb.QuantileDMatrix"
} else if (inherits(x, "xgb.ExternalDMatrix")) {
"xgb.ExternalDMatrix"
} else if (inherits(x, "xgb.ExtMemDMatrix")) {
"xgb.ExtMemDMatrix"
} else if (inherits(x, "xgb.ProxyDMatrix")) {
"xgb.ProxyDMatrix"
} else {

View File

@ -24,7 +24,7 @@
#' for model training by the objective.
#'
#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
#' or `xgb.ExternalDMatrix` are not supported here.
#' or `xgb.ExtMemDMatrix` are not supported here.
#' @param nrounds The max number of iterations.
#' @param nfold The original dataset is randomly partitioned into `nfold` equal size subsamples.
#' @param prediction A logical value indicating whether to return the test fold predictions

View File

@ -26,7 +26,7 @@ to pass here. Supported types are:
\itemize{
\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
\code{integer} and \code{logical}, missing values might not be automatically recognized as
as such - see the documentation for parameter \code{missing} in \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}
as such - see the documentation for parameter \code{missing} in \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}
for details on this.
\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
conversions applied to it. See the documentation for parameter \code{data} in
@ -92,7 +92,7 @@ data and parameters passed here. It does \strong{not} inherit from \code{xgb.DMa
}
\description{
Helper function to supply data in batches of a data iterator when
constructing a DMatrix from external memory through \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}
constructing a DMatrix from external memory through \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}
or through \code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}.
This function is \strong{only} meant to be called inside of a callback function (which
@ -104,8 +104,8 @@ The object that results from calling this function directly is \strong{not} like
an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
For more information and for example usage, see the documentation for \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}.
For more information and for example usage, see the documentation for \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\seealso{
\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}.
\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}

View File

@ -33,7 +33,7 @@ Note that, after resetting the iterator, the batches will be accessed again, so
}
\value{
An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
be passed to \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}.
be passed to \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\description{
Interface to create a custom data iterator in order to construct a DMatrix
@ -42,11 +42,11 @@ from external memory.
This function is responsible for generating an R object structure containing callback
functions and an environment shared with them.
The output structure from this function is then meant to be passed to \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}},
The output structure from this function is then meant to be passed to \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}},
which will consume the data and create a DMatrix from it by executing the callback functions.
For more information, and for a usage example, see the documentation for \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}.
For more information, and for a usage example, see the documentation for \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
}
\seealso{
\code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}.
\code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}.
}

View File

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.ExternalDMatrix}
\alias{xgb.ExternalDMatrix}
\name{xgb.ExtMemDMatrix}
\alias{xgb.ExtMemDMatrix}
\title{DMatrix from External Data}
\usage{
xgb.ExternalDMatrix(
xgb.ExtMemDMatrix(
data_iterator,
cache_prefix = tempdir(),
missing = NA,
@ -26,14 +26,14 @@ it will not be adapted for different input types.
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
which is interpreted as a floating-point NaN by \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}} and by
which is interpreted as a floating-point NaN by \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}} and by
\code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}, these integer missing values will not be treated as missing.
This should not pose any problem for \code{numeric} types, since they do have an inheret NaN value.}
\item{nthread}{Number of threads used for creating DMatrix.}
}
\value{
An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
An 'xgb.DMatrix' object, with subclass 'xgb.ExtMemDMatrix', in which the data is not
held internally but accessed through the iterator when needed.
}
\description{
@ -105,7 +105,7 @@ data_iterator <- xgb.DataIter(
cache_prefix <- tempdir()
# DMatrix will be constructed from the iterator's batches
dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
dm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
# After construction, can be used as a regular DMatrix
params <- list(nthread = 1, objective = "reg:squarederror")

View File

@ -25,7 +25,7 @@ it will not be adapted for different input types.
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
which is interpreted as a floating-point NaN by \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}} and by
which is interpreted as a floating-point NaN by \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}} and by
\code{\link[=xgb.QuantileDMatrix.from_iterator]{xgb.QuantileDMatrix.from_iterator()}}, these integer missing values will not be treated as missing.
This should not pose any problem for \code{numeric} types, since they do have an inheret NaN value.}
@ -48,7 +48,7 @@ An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
calling function \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}, with the same advantages and limitations) from
external data supplied by \code{\link[=xgb.DataIter]{xgb.DataIter()}}, potentially passed in batches from
a bigger set that might not fit entirely in memory, same way as \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}}.
a bigger set that might not fit entirely in memory, same way as \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}}.
Note that, while external data will only be loaded through the iterator (thus the full data
might not be held entirely in-memory), the quantized representation of the data will get
@ -60,6 +60,6 @@ For more information, see the guide 'Using XGBoost External Memory Version':
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
}
\seealso{
\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}, \code{\link[=xgb.ExternalDMatrix]{xgb.ExternalDMatrix()}},
\code{\link[=xgb.DataIter]{xgb.DataIter()}}, \code{\link[=xgb.DataBatch]{xgb.DataBatch()}}, \code{\link[=xgb.ExtMemDMatrix]{xgb.ExtMemDMatrix()}},
\code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}
}

View File

@ -53,7 +53,7 @@ system - thus, for reproducible results, one needs to call the \code{\link[=set.
for model training by the objective.
Note that only the basic \code{xgb.DMatrix} class is supported - variants such as \code{xgb.QuantileDMatrix}
or \code{xgb.ExternalDMatrix} are not supported here.}
or \code{xgb.ExtMemDMatrix} are not supported here.}
\item{nrounds}{The max number of iterations.}

View File

@ -478,7 +478,7 @@ test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", {
})
})
test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMatrix", {
test_that("xgb.DMatrix: ExtMemDMatrix produces the same results as regular DMatrix", {
data(mtcars)
y <- mtcars[, 1]
x <- as.matrix(mtcars[, -1])
@ -528,8 +528,8 @@ test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMa
f_reset = iterator_reset
)
cache_prefix <- tempdir()
edm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
expect_true(inherits(edm, "xgb.ExternalDMatrix"))
edm <- xgb.ExtMemDMatrix(data_iterator, cache_prefix, nthread = 1)
expect_true(inherits(edm, "xgb.ExtMemDMatrix"))
expect_true(inherits(edm, "xgb.DMatrix"))
set.seed(123)
model_ext <- xgb.train(
@ -660,7 +660,7 @@ test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the u
f_reset = iterator_reset
)
expect_error(
{xgb.ExternalDMatrix(data_iterator, nthread = 1)},
{xgb.ExtMemDMatrix(data_iterator, nthread = 1)},
"custom error"
)
})

View File

@ -37,6 +37,9 @@ def pack_rpackage() -> Path:
output = subprocess.run(["git", "clean", "-xdf", "--dry-run"], capture_output=True)
if output.returncode != 0:
raise ValueError("Failed to check git repository status.", output)
if len(output.stdout) == 0:
would_remove = None
else:
would_remove = output.stdout.decode("utf-8").strip().split("\n")
if would_remove and not all(f.find("tests/ci_build") != -1 for f in would_remove):