[R] Add data iterator, quantile dmatrix, external memory, and missing feature_types (#9913)

2024-01-30 12:26:44 +01:00
parent d9f4ab557a
commit 3abbbe41ac
13 changed files with 1754 additions and 104 deletions
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -2,6 +2,7 @@
 % Please edit documentation in R/xgb.DMatrix.R
 \name{xgb.DMatrix}
 \alias{xgb.DMatrix}
+\alias{xgb.QuantileDMatrix}
 \title{Construct xgb.DMatrix object}
 \usage{
 xgb.DMatrix(
@@ -12,6 +13,7 @@ xgb.DMatrix(
  missing = NA,
  silent = FALSE,
  feature_names = colnames(data),
+  feature_types = NULL,
  nthread = NULL,
  group = NULL,
  qid = NULL,
@@ -20,12 +22,55 @@ xgb.DMatrix(
  feature_weights = NULL,
  enable_categorical = FALSE
 )
+
+xgb.QuantileDMatrix(
+  data,
+  label = NULL,
+  weight = NULL,
+  base_margin = NULL,
+  missing = NA,
+  feature_names = colnames(data),
+  feature_types = NULL,
+  nthread = NULL,
+  group = NULL,
+  qid = NULL,
+  label_lower_bound = NULL,
+  label_upper_bound = NULL,
+  feature_weights = NULL,
+  enable_categorical = FALSE,
+  ref = NULL,
+  max_bin = NULL
+)
 }
 \arguments{
-\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
-a \code{dgRMatrix} object,
-a \code{dsparseVector} object (only when making predictions from a fitted model, will be
-interpreted as a row vector), or a character string representing a filename.}
+\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or
+for getting predictions out of a fitted model.
+
+Supported input types are as follows:\itemize{
+\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
+\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
+
+If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
+Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
+will be thrown.
+
+Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
+encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
+types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
+responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
+was constructed.
+
+Other column types are not supported.
+\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
+\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for
+'xgb.QuantileDMatrix'.
+\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
+as a single row (only when making predictions from a fitted model).
+\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
+supported for 'xgb.QuantileDMatrix'.
+\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
+\bold{not} supported for 'xgb.QuantileDMatrix'.
+}}

 \item{label}{Label of the training data.}

@@ -41,13 +86,36 @@ so it doesn't make sense to assign weights to individual data points.}
 \if{html}{\out{<div class="sourceCode">}}\preformatted{   In the case of multi-output models, one can also pass multi-dimensional base_margin.
 }\if{html}{\out{</div>}}}

-\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
-It is useful when a 0 or some other extreme value represents missing values in data.}
+\item{missing}{A float value that represents missing values in data (not used when creating DMatrix
+from text files).
+It is useful to change when a zero, infinite, or some other extreme value represents missing
+values in data.}

 \item{silent}{whether to suppress printing an informational message after loading from a file.}

 \item{feature_names}{Set names for features. Overrides column names in data
-frame and matrix.}
+frame and matrix.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note: columns are not referenced by name when calling `predict`, so the column order there
+   must be the same as in the DMatrix construction, regardless of the column names.
+}\if{html}{\out{</div>}}}
+
+\item{feature_types}{Set types for features.
+
+If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+automatically from the column types.
+
+Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
+with the following possible values:\itemize{
+\item "c", which represents categorical columns.
+\item "q", which represents numeric columns.
+\item "int", which represents integer columns.
+\item "i", which represents logical (boolean) columns.
+}
+
+Note that, while categorical types are treated differently from the rest for model fitting
+purposes, the other types do not influence the generated model, but have effects in other
+functionalities such as feature importances.}

 \item{nthread}{Number of threads used for creating DMatrix.}

@@ -74,13 +142,33 @@ frame and matrix.}

                      JSON/UBJSON serialization format is required for this.
 }\if{html}{\out{</div>}}}
+
+\item{ref}{The training dataset that provides quantile information, needed when creating
+validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+as a reference means that the same quantization applied to the training data is
+applied to the validation/test data.}
+
+\item{max_bin}{The number of histogram bins, should be consistent with the training parameter
+\code{max_bin}.
+
+This is only supported when constructing a QuantileDMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
+subclass 'xgb.QuantileDMatrix'.
 }
 \description{
-Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
-Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
-\code{\link{xgb.DMatrix.save}}).
+Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
+such as \link{xgb.train} or \link{predict.xgb.Booster}.
 }
 \details{
+Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
+method already applied to it, which can be used to reduce memory usage (compared to using a
+regular DMatrix first and then creating a quantization out of it) when using the histogram
+method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the
+sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method
+(\code{tree_method = "approx"}).
+
 Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
 If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
 chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
--- a/R-package/man/xgb.DataIter.Rd
+++ b/R-package/man/xgb.DataIter.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.DataIter}
+\alias{xgb.DataIter}
+\title{XGBoost Data Iterator}
+\usage{
+xgb.DataIter(env = new.env(), f_next, f_reset)
+}
+\arguments{
+\item{env}{An R environment to pass to the callback functions supplied here, which can be
+used to keep track of variables to determine how to handle the batches.
+
+For example, one might want to keep track of an iteration number in this environment in order
+to know which part of the data to pass next.}
+
+\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
+\item Accessing or retrieving the next batch of data in the iterator.
+\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
+\item Keeping track of where in the iterator batch it is or will go next, which can for example
+be done by modifying variables in the \code{env} variable that is passed here.
+\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
+when the stream of data ends (all batches in the iterator have been consumed), or the result from
+calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
+}}
+
+\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
+(i.e. taking it back to the first batch, called before and after the sequence of batches
+has been consumed).
+
+Note that, after resetting the iterator, the batches will be accessed again, so the same data
+(and in the same order) must be passed in subsequent iterations.}
+}
+\value{
+An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
+be passed to \link{xgb.ExternalDMatrix}.
+}
+\description{
+Interface to create a custom data iterator in order to construct a DMatrix
+from external memory.
+
+This function is responsible for generating an R object structure containing callback
+functions and an environment shared with them.
+
+The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
+which will consume the data and create a DMatrix from it by executing the callback functions.
+
+For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
+}
+\seealso{
+\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
+}
--- a/R-package/man/xgb.ExternalDMatrix.Rd
+++ b/R-package/man/xgb.ExternalDMatrix.Rd
@@ -0,0 +1,122 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.ExternalDMatrix}
+\alias{xgb.ExternalDMatrix}
+\title{DMatrix from External Data}
+\usage{
+xgb.ExternalDMatrix(
+  data_iterator,
+  cache_prefix = tempdir(),
+  missing = NA,
+  nthread = NULL
+)
+}
+\arguments{
+\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+which includes an environment shared between function calls, and functions to access
+the data in batches on-demand.}
+
+\item{cache_prefix}{The path of the cache file. The caller must initialize all the directories in this path.}
+
+\item{missing}{A float value that represents missing values in data.
+
+Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
+it will not be adapted for different input types.
+
+For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
+(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
+which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
+'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
+
+\item{nthread}{Number of threads used for creating DMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
+held internally but accessed through the iterator when needed.
+}
+\description{
+Create a special type of xgboost 'DMatrix' object from external data
+supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
+bigger set that might not fit entirely in memory.
+
+The data supplied by the iterator is accessed on-demand as needed, multiple times,
+without being concatenated, but note that fields like 'label' \bold{will} be
+concatenated from multiple calls to the data iterator.
+
+For more information, see the guide 'Using XGBoost External Memory Version':
+\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+}
+\examples{
+library(xgboost)
+data(mtcars)
+
+# this custom environment will be passed to the iterator
+# functions at each call. It's up to the user to keep
+# track of the iteration number in this environment.
+iterator_env <- as.environment(
+  list(
+    iter = 0,
+    x = mtcars[, -1],
+    y = mtcars[, 1]
+  )
+)
+
+# Data is passed in two batches.
+# In this example, batches are obtained by subsetting the 'x' variable.
+# This is not advantageous to do, since the data is already loaded in memory
+# and can be passed in full in one go, but there can be situations in which
+# only a subset of the data will fit in the computer's memory, and it can
+# be loaded in batches that are accessed one-at-a-time only.
+iterator_next <- function(iterator_env) {
+  curr_iter <- iterator_env[["iter"]]
+  if (curr_iter >= 2) {
+    # there are only two batches, so this signals end of the stream
+    return(NULL)
+  }
+
+  if (curr_iter == 0) {
+    x_batch <- iterator_env[["x"]][1:16, ]
+    y_batch <- iterator_env[["y"]][1:16]
+  } else {
+    x_batch <- iterator_env[["x"]][17:32, ]
+    y_batch <- iterator_env[["y"]][17:32]
+  }
+  on.exit({
+    iterator_env[["iter"]] <- curr_iter + 1
+  })
+
+  # Function 'xgb.ProxyDMatrix' must be called manually
+  # at each batch with all the appropriate attributes,
+  # such as feature names and feature types.
+  return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
+}
+
+# This moves the iterator back to its beginning
+iterator_reset <- function(iterator_env) {
+  iterator_env[["iter"]] <- 0
+}
+
+data_iterator <- xgb.DataIter(
+  env = iterator_env,
+  f_next = iterator_next,
+  f_reset = iterator_reset
+)
+cache_prefix <- tempdir()
+
+# DMatrix will be constructed from the iterator's batches
+dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
+
+# After construction, can be used as a regular DMatrix
+params <- list(nthread = 1, objective = "reg:squarederror")
+model <- xgb.train(data = dm, nrounds = 2, params = params)
+
+# Predictions can also be called on it, and should be the same
+# as if the data were passed differently.
+pred_dm <- predict(model, dm)
+pred_mat <- predict(model, as.matrix(mtcars[, -1]))
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
+}
--- a/R-package/man/xgb.ProxyDMatrix.Rd
+++ b/R-package/man/xgb.ProxyDMatrix.Rd
@@ -0,0 +1,121 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.ProxyDMatrix}
+\alias{xgb.ProxyDMatrix}
+\title{Proxy DMatrix Updater}
+\usage{
+xgb.ProxyDMatrix(
+  data,
+  label = NULL,
+  weight = NULL,
+  base_margin = NULL,
+  feature_names = colnames(data),
+  feature_types = NULL,
+  group = NULL,
+  qid = NULL,
+  label_lower_bound = NULL,
+  label_upper_bound = NULL,
+  feature_weights = NULL,
+  enable_categorical = FALSE
+)
+}
+\arguments{
+\item{data}{Data belonging to this batch.
+
+Note that not all of the input types supported by \link{xgb.DMatrix} are possible
+to pass here. Supported types are:\itemize{
+\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
+\code{integer} and \code{logical}, missing values might not be automatically recognized as
+such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix}
+for details on this.
+\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
+conversions applied to it. See the documentation for parameter \code{data} in
+\link{xgb.DMatrix} for details on it.
+\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
+}}
+
+\item{label}{Label of the training data.}
+
+\item{weight}{Weight for each instance.
+
+Note that, for ranking task, weights are per-group.  In ranking task, one weight
+is assigned to each group (not each data point). This is because we
+only care about the relative ordering of data points within each group,
+so it doesn't make sense to assign weights to individual data points.}
+
+\item{base_margin}{Base margin used for boosting from existing model.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   In the case of multi-output models, one can also pass multi-dimensional base_margin.
+}\if{html}{\out{</div>}}}
+
+\item{feature_names}{Set names for features. Overrides column names in data
+frame and matrix.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{   Note: columns are not referenced by name when calling `predict`, so the column order there
+   must be the same as in the DMatrix construction, regardless of the column names.
+}\if{html}{\out{</div>}}}
+
+\item{feature_types}{Set types for features.
+
+If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
+automatically from the column types.
+
+Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
+with the following possible values:\itemize{
+\item "c", which represents categorical columns.
+\item "q", which represents numeric columns.
+\item "int", which represents integer columns.
+\item "i", which represents logical (boolean) columns.
+}
+
+Note that, while categorical types are treated differently from the rest for model fitting
+purposes, the other types do not influence the generated model, but have effects in other
+functionalities such as feature importances.}
+
+\item{group}{Group size for all ranking groups.}
+
+\item{qid}{Query ID for data samples, used for ranking.}
+
+\item{label_lower_bound}{Lower bound for survival training.}
+
+\item{label_upper_bound}{Upper bound for survival training.}
+
+\item{feature_weights}{Set feature weights for column sampling.}
+
+\item{enable_categorical}{Experimental support of specializing for categorical features.
+
+\if{html}{\out{<div class="sourceCode">}}\preformatted{                      If passing 'TRUE' and 'data' is a data frame,
+                      columns of categorical types will automatically
+                      be set to be of categorical type (feature_type='c') in the resulting DMatrix.
+
+                      If passing 'FALSE' and 'data' is a data frame with categorical columns,
+                      it will result in an error being thrown.
+
+                      If 'data' is not a data frame, this argument is ignored.
+
+                      JSON/UBJSON serialization format is required for this.
+}\if{html}{\out{</div>}}}
+}
+\value{
+An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
+data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
+}
+\description{
+Helper function to supply data in batches of a data iterator when
+constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
+or through \link{xgb.QuantileDMatrix.from_iterator}.
+
+This function is \bold{only} meant to be called inside of a callback function (which
+is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
+when constructing a DMatrix through external memory - otherwise, one should call
+\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
+
+The object that results from calling this function directly is \bold{not} like the other
+\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only
+possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
+
+For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
+}
--- a/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
+++ b/R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.DMatrix.R
+\name{xgb.QuantileDMatrix.from_iterator}
+\alias{xgb.QuantileDMatrix.from_iterator}
+\title{QuantileDMatrix from External Data}
+\usage{
+xgb.QuantileDMatrix.from_iterator(
+  data_iterator,
+  missing = NA,
+  nthread = NULL,
+  ref = NULL,
+  max_bin = NULL
+)
+}
+\arguments{
+\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
+which includes an environment shared between function calls, and functions to access
+the data in batches on-demand.}
+
+\item{missing}{A float value that represents missing values in data.
+
+Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
+correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
+it will not be adapted for different input types.
+
+For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
+(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
+which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
+'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
+This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
+
+\item{nthread}{Number of threads used for creating DMatrix.}
+
+\item{ref}{The training dataset that provides quantile information, needed when creating
+validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
+as a reference means that the same quantization applied to the training data is
+applied to the validation/test data.}
+
+\item{max_bin}{The number of histogram bins, should be consistent with the training parameter
+\code{max_bin}.
+
+This is only supported when constructing a QuantileDMatrix.}
+}
+\value{
+An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
+}
+\description{
+Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
+calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
+external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
+a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
+
+Note that, while external data will only be loaded through the iterator (thus the full data
+might not be held entirely in-memory), the quantized representation of the data will get
+created in-memory, being concatenated from multiple calls to the data iterator. The quantized
+version is typically lighter than the original data, so there might be cases in which this
+representation could potentially fit in memory even if the full data doesn't.
+
+For more information, see the guide 'Using XGBoost External Memory Version':
+\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
+}
+\seealso{
+\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
+\link{xgb.QuantileDMatrix}
+}