[R] Add data iterator, quantile dmatrix, external memory, and missing feature_types (#9913)
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.DMatrix}
|
||||
\alias{xgb.DMatrix}
|
||||
\alias{xgb.QuantileDMatrix}
|
||||
\title{Construct xgb.DMatrix object}
|
||||
\usage{
|
||||
xgb.DMatrix(
|
||||
@@ -12,6 +13,7 @@ xgb.DMatrix(
|
||||
missing = NA,
|
||||
silent = FALSE,
|
||||
feature_names = colnames(data),
|
||||
feature_types = NULL,
|
||||
nthread = NULL,
|
||||
group = NULL,
|
||||
qid = NULL,
|
||||
@@ -20,12 +22,55 @@ xgb.DMatrix(
|
||||
feature_weights = NULL,
|
||||
enable_categorical = FALSE
|
||||
)
|
||||
|
||||
xgb.QuantileDMatrix(
|
||||
data,
|
||||
label = NULL,
|
||||
weight = NULL,
|
||||
base_margin = NULL,
|
||||
missing = NA,
|
||||
feature_names = colnames(data),
|
||||
feature_types = NULL,
|
||||
nthread = NULL,
|
||||
group = NULL,
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL,
|
||||
enable_categorical = FALSE,
|
||||
ref = NULL,
|
||||
max_bin = NULL
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
|
||||
a \code{dgRMatrix} object,
|
||||
a \code{dsparseVector} object (only when making predictions from a fitted model, will be
|
||||
interpreted as a row vector), or a character string representing a filename.}
|
||||
\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or
|
||||
for getting predictions out of a fitted model.
|
||||
|
||||
Supported input types are as follows:\itemize{
|
||||
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
|
||||
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
|
||||
|
||||
If passing \code{enable_categorical=TRUE}, columns with \code{factor} type will be treated as categorical.
|
||||
Otherwise, if passing \code{enable_categorical=FALSE} and the data contains \code{factor} columns, an error
|
||||
will be thrown.
|
||||
|
||||
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
|
||||
encoding) will be converted inside the function call. Be aware that the encoding used for \code{factor}
|
||||
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
|
||||
responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
|
||||
was constructed.
|
||||
|
||||
Other column types are not supported.
|
||||
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
|
||||
\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for
|
||||
'xgb.QuantileDMatrix'.
|
||||
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
|
||||
as a single row (only when making predictions from a fitted model).
|
||||
\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||
supported for 'xgb.QuantileDMatrix'.
|
||||
\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||
\bold{not} supported for 'xgb.QuantileDMatrix'.
|
||||
}}
|
||||
|
||||
\item{label}{Label of the training data.}
|
||||
|
||||
@@ -41,13 +86,36 @@ so it doesn't make sense to assign weights to individual data points.}
|
||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
|
||||
It is useful when a 0 or some other extreme value represents missing values in data.}
|
||||
\item{missing}{A float value to represent missing values in data (not used when creating DMatrix
|
||||
from text files).
|
||||
It is useful to change when a zero, infinite, or some other extreme value represents missing
|
||||
values in data.}
|
||||
|
||||
\item{silent}{whether to suppress printing an informational message after loading from a file.}
|
||||
|
||||
\item{feature_names}{Set names for features. Overrides column names in data
|
||||
frame and matrix.}
|
||||
frame and matrix.
|
||||
|
||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||
must be the same as in the DMatrix construction, regardless of the column names.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{feature_types}{Set types for features.
|
||||
|
||||
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
|
||||
automatically from the column types.
|
||||
|
||||
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||
with the following possible values:\itemize{
|
||||
\item "c", which represents categorical columns.
|
||||
\item "q", which represents numeric columns.
|
||||
\item "int", which represents integer columns.
|
||||
\item "i", which represents logical (boolean) columns.
|
||||
}
|
||||
|
||||
Note that, while categorical types are treated differently from the rest for model fitting
|
||||
purposes, the other types do not influence the generated model, but have effects in other
|
||||
functionalities such as feature importances.}
|
||||
|
||||
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||
|
||||
@@ -74,13 +142,33 @@ frame and matrix.}
|
||||
|
||||
JSON/UBJSON serialization format is required for this.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||
as a reference means that the same quantisation applied to the training data is
|
||||
applied to the validation/test data.}
|
||||
|
||||
\item{max_bin}{The number of histogram bins, which should be consistent with the training parameter
|
||||
\code{max_bin}.
|
||||
|
||||
This is only supported when constructing a QuantileDMatrix.}
|
||||
}
|
||||
\value{
|
||||
An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||
subclass 'xgb.QuantileDMatrix'.
|
||||
}
|
||||
\description{
|
||||
Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
|
||||
Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
|
||||
\code{\link{xgb.DMatrix.save}}).
|
||||
Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
|
||||
such as \link{xgb.train} or \link{predict.xgb.Booster}.
|
||||
}
|
||||
\details{
|
||||
Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
|
||||
method already applied to it, which can be used to reduce memory usage (compared to using a
|
||||
regular DMatrix first and then creating a quantization out of it) when using the histogram
|
||||
method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the
|
||||
sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method
|
||||
(\code{tree_method = "approx"}).
|
||||
|
||||
Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
||||
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
|
||||
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
|
||||
|
||||
51
R-package/man/xgb.DataIter.Rd
Normal file
51
R-package/man/xgb.DataIter.Rd
Normal file
@@ -0,0 +1,51 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.DataIter}
|
||||
\alias{xgb.DataIter}
|
||||
\title{XGBoost Data Iterator}
|
||||
\usage{
|
||||
xgb.DataIter(env = new.env(), f_next, f_reset)
|
||||
}
|
||||
\arguments{
|
||||
\item{env}{An R environment to pass to the callback functions supplied here, which can be
|
||||
used to keep track of variables to determine how to handle the batches.
|
||||
|
||||
For example, one might want to keep track of an iteration number in this environment in order
|
||||
to know which part of the data to pass next.}
|
||||
|
||||
\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
|
||||
\item Accessing or retrieving the next batch of data in the iterator.
|
||||
\item Supplying this data by calling function \link{xgb.ProxyDMatrix} on it and returning the result.
|
||||
\item Keeping track of where in the iterator batch it is or will go next, which can for example
|
||||
be done by modifying variables in the \code{env} variable that is passed here.
|
||||
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
|
||||
when the stream of data ends (all batches in the iterator have been consumed), or the result from
|
||||
calling \link{xgb.ProxyDMatrix} when there are more batches in the line to be consumed.
|
||||
}}
|
||||
|
||||
\item{f_reset}{\verb{function(env)} which is responsible for resetting the data iterator
|
||||
(i.e. taking it back to the first batch, called before and after the sequence of batches
|
||||
has been consumed).
|
||||
|
||||
Note that, after resetting the iterator, the batches will be accessed again, so the same data
|
||||
(and in the same order) must be passed in subsequent iterations.}
|
||||
}
|
||||
\value{
|
||||
An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
|
||||
be passed to \link{xgb.ExternalDMatrix}.
|
||||
}
|
||||
\description{
|
||||
Interface to create a custom data iterator in order to construct a DMatrix
|
||||
from external memory.
|
||||
|
||||
This function is responsible for generating an R object structure containing callback
|
||||
functions and an environment shared with them.
|
||||
|
||||
The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
|
||||
which will consume the data and create a DMatrix from it by executing the callback functions.
|
||||
|
||||
For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||
}
|
||||
\seealso{
|
||||
\link{xgb.ExternalDMatrix}, \link{xgb.ProxyDMatrix}.
|
||||
}
|
||||
122
R-package/man/xgb.ExternalDMatrix.Rd
Normal file
122
R-package/man/xgb.ExternalDMatrix.Rd
Normal file
@@ -0,0 +1,122 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.ExternalDMatrix}
|
||||
\alias{xgb.ExternalDMatrix}
|
||||
\title{DMatrix from External Data}
|
||||
\usage{
|
||||
xgb.ExternalDMatrix(
|
||||
data_iterator,
|
||||
cache_prefix = tempdir(),
|
||||
missing = NA,
|
||||
nthread = NULL
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
|
||||
which includes an environment shared between function calls, and functions to access
|
||||
the data in batches on-demand.}
|
||||
|
||||
\item{cache_prefix}{The path of cache file, caller must initialize all the directories in this path.}
|
||||
|
||||
\item{missing}{A float value to represent missing values in data.
|
||||
|
||||
Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
|
||||
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
|
||||
it will not be adapted for different input types.
|
||||
|
||||
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
|
||||
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
|
||||
which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||
'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||
This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
|
||||
|
||||
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||
}
|
||||
\value{
|
||||
An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
|
||||
held internally but accessed through the iterator when needed.
|
||||
}
|
||||
\description{
|
||||
Create a special type of xgboost 'DMatrix' object from external data
|
||||
supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
|
||||
bigger set that might not fit entirely in memory.
|
||||
|
||||
The data supplied by the iterator is accessed on-demand as needed, multiple times,
|
||||
without being concatenated, but note that fields like 'label' \bold{will} be
|
||||
concatenated from multiple calls to the data iterator.
|
||||
|
||||
For more information, see the guide 'Using XGBoost External Memory Version':
|
||||
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||
}
|
||||
\examples{
|
||||
library(xgboost)
|
||||
data(mtcars)
|
||||
|
||||
# this custom environment will be passed to the iterator
|
||||
# functions at each call. It's up to the user to keep
|
||||
# track of the iteration number in this environment.
|
||||
iterator_env <- as.environment(
|
||||
list(
|
||||
iter = 0,
|
||||
x = mtcars[, -1],
|
||||
y = mtcars[, 1]
|
||||
)
|
||||
)
|
||||
|
||||
# Data is passed in two batches.
|
||||
# In this example, batches are obtained by subsetting the 'x' variable.
|
||||
# This is not advantageous to do, since the data is already loaded in memory
|
||||
# and can be passed in full in one go, but there can be situations in which
|
||||
# only a subset of the data will fit in the computer's memory, and it can
|
||||
# be loaded in batches that are accessed one-at-a-time only.
|
||||
iterator_next <- function(iterator_env) {
|
||||
curr_iter <- iterator_env[["iter"]]
|
||||
if (curr_iter >= 2) {
|
||||
# there are only two batches, so this signals end of the stream
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
if (curr_iter == 0) {
|
||||
x_batch <- iterator_env[["x"]][1:16, ]
|
||||
y_batch <- iterator_env[["y"]][1:16]
|
||||
} else {
|
||||
x_batch <- iterator_env[["x"]][17:32, ]
|
||||
y_batch <- iterator_env[["y"]][17:32]
|
||||
}
|
||||
on.exit({
|
||||
iterator_env[["iter"]] <- curr_iter + 1
|
||||
})
|
||||
|
||||
# Function 'xgb.ProxyDMatrix' must be called manually
|
||||
# at each batch with all the appropriate attributes,
|
||||
# such as feature names and feature types.
|
||||
return(xgb.ProxyDMatrix(data = x_batch, label = y_batch))
|
||||
}
|
||||
|
||||
# This moves the iterator back to its beginning
|
||||
iterator_reset <- function(iterator_env) {
|
||||
iterator_env[["iter"]] <- 0
|
||||
}
|
||||
|
||||
data_iterator <- xgb.DataIter(
|
||||
env = iterator_env,
|
||||
f_next = iterator_next,
|
||||
f_reset = iterator_reset
|
||||
)
|
||||
cache_prefix <- tempdir()
|
||||
|
||||
# DMatrix will be constructed from the iterator's batches
|
||||
dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
|
||||
|
||||
# After construction, can be used as a regular DMatrix
|
||||
params <- list(nthread = 1, objective = "reg:squarederror")
|
||||
model <- xgb.train(data = dm, nrounds = 2, params = params)
|
||||
|
||||
# Predictions can also be called on it, and should be the same
|
||||
# as if the data were passed differently.
|
||||
pred_dm <- predict(model, dm)
|
||||
pred_mat <- predict(model, as.matrix(mtcars[, -1]))
|
||||
}
|
||||
\seealso{
|
||||
\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.QuantileDMatrix.from_iterator}
|
||||
}
|
||||
121
R-package/man/xgb.ProxyDMatrix.Rd
Normal file
121
R-package/man/xgb.ProxyDMatrix.Rd
Normal file
@@ -0,0 +1,121 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.ProxyDMatrix}
|
||||
\alias{xgb.ProxyDMatrix}
|
||||
\title{Proxy DMatrix Updater}
|
||||
\usage{
|
||||
xgb.ProxyDMatrix(
|
||||
data,
|
||||
label = NULL,
|
||||
weight = NULL,
|
||||
base_margin = NULL,
|
||||
feature_names = colnames(data),
|
||||
feature_types = NULL,
|
||||
group = NULL,
|
||||
qid = NULL,
|
||||
label_lower_bound = NULL,
|
||||
label_upper_bound = NULL,
|
||||
feature_weights = NULL,
|
||||
enable_categorical = FALSE
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{Data belonging to this batch.
|
||||
|
||||
Note that not all of the input types supported by \link{xgb.DMatrix} are possible
|
||||
to pass here. Supported types are:\itemize{
|
||||
\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
|
||||
\code{integer} and \code{logical}, missing values might not be automatically recognized as
|
||||
such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix}
|
||||
for details on this.
|
||||
\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
|
||||
conversions applied to it. See the documentation for parameter \code{data} in
|
||||
\link{xgb.DMatrix} for details on it.
|
||||
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
|
||||
}}
|
||||
|
||||
\item{label}{Label of the training data.}
|
||||
|
||||
\item{weight}{Weight for each instance.
|
||||
|
||||
Note that, for ranking task, weights are per-group. In ranking task, one weight
|
||||
is assigned to each group (not each data point). This is because we
|
||||
only care about the relative ordering of data points within each group,
|
||||
so it doesn't make sense to assign weights to individual data points.}
|
||||
|
||||
\item{base_margin}{Base margin used for boosting from existing model.
|
||||
|
||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{feature_names}{Set names for features. Overrides column names in data
|
||||
frame and matrix.
|
||||
|
||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||
must be the same as in the DMatrix construction, regardless of the column names.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{feature_types}{Set types for features.
|
||||
|
||||
If \code{data} is a \code{data.frame} and passing \code{enable_categorical=TRUE}, the types will be deduced
|
||||
automatically from the column types.
|
||||
|
||||
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||
with the following possible values:\itemize{
|
||||
\item "c", which represents categorical columns.
|
||||
\item "q", which represents numeric columns.
|
||||
\item "int", which represents integer columns.
|
||||
\item "i", which represents logical (boolean) columns.
|
||||
}
|
||||
|
||||
Note that, while categorical types are treated differently from the rest for model fitting
|
||||
purposes, the other types do not influence the generated model, but have effects in other
|
||||
functionalities such as feature importances.}
|
||||
|
||||
\item{group}{Group sizes for all ranking groups.}
|
||||
|
||||
\item{qid}{Query ID for data samples, used for ranking.}
|
||||
|
||||
\item{label_lower_bound}{Lower bound for survival training.}
|
||||
|
||||
\item{label_upper_bound}{Upper bound for survival training.}
|
||||
|
||||
\item{feature_weights}{Set feature weights for column sampling.}
|
||||
|
||||
\item{enable_categorical}{Experimental support of specializing for categorical features.
|
||||
|
||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
|
||||
columns of categorical types will automatically
|
||||
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
||||
|
||||
If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
||||
it will result in an error being thrown.
|
||||
|
||||
If 'data' is not a data frame, this argument is ignored.
|
||||
|
||||
JSON/UBJSON serialization format is required for this.
|
||||
}\if{html}{\out{</div>}}}
|
||||
}
|
||||
\value{
|
||||
An object of class \code{xgb.ProxyDMatrix}, which is just a list containing the
|
||||
data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
|
||||
}
|
||||
\description{
|
||||
Helper function to supply data in batches of a data iterator when
|
||||
constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
|
||||
or through \link{xgb.QuantileDMatrix.from_iterator}.
|
||||
|
||||
This function is \bold{only} meant to be called inside of a callback function (which
|
||||
is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
|
||||
when constructing a DMatrix through external memory - otherwise, one should call
|
||||
\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
|
||||
|
||||
The object that results from calling this function directly is \bold{not} like the other
|
||||
\code{xgb.DMatrix} variants - i.e. cannot be used to train a model, nor to get predictions - only
|
||||
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
|
||||
|
||||
For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||
}
|
||||
\seealso{
|
||||
\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
|
||||
}
|
||||
65
R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
Normal file
65
R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
Normal file
@@ -0,0 +1,65 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.DMatrix.R
|
||||
\name{xgb.QuantileDMatrix.from_iterator}
|
||||
\alias{xgb.QuantileDMatrix.from_iterator}
|
||||
\title{QuantileDMatrix from External Data}
|
||||
\usage{
|
||||
xgb.QuantileDMatrix.from_iterator(
|
||||
data_iterator,
|
||||
missing = NA,
|
||||
nthread = NULL,
|
||||
ref = NULL,
|
||||
max_bin = NULL
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
|
||||
which includes an environment shared between function calls, and functions to access
|
||||
the data in batches on-demand.}
|
||||
|
||||
\item{missing}{A float value to represent missing values in data.
|
||||
|
||||
Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
|
||||
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
|
||||
it will not be adapted for different input types.
|
||||
|
||||
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
|
||||
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
|
||||
which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||
'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||
This should not pose any problem for \code{numeric} types, since they do have an inherent NaN value.}
|
||||
|
||||
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||
|
||||
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||
as a reference means that the same quantisation applied to the training data is
|
||||
applied to the validation/test data.}
|
||||
|
||||
\item{max_bin}{The number of histogram bins, which should be consistent with the training parameter
|
||||
\code{max_bin}.
|
||||
|
||||
This is only supported when constructing a QuantileDMatrix.}
|
||||
}
|
||||
\value{
|
||||
An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
|
||||
}
|
||||
\description{
|
||||
Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
|
||||
calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
|
||||
external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
|
||||
a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
|
||||
|
||||
Note that, while external data will only be loaded through the iterator (thus the full data
|
||||
might not be held entirely in-memory), the quantized representation of the data will get
|
||||
created in-memory, being concatenated from multiple calls to the data iterator. The quantized
|
||||
version is typically lighter than the original data, so there might be cases in which this
|
||||
representation could potentially fit in memory even if the full data doesn't.
|
||||
|
||||
For more information, see the guide 'Using XGBoost External Memory Version':
|
||||
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||
}
|
||||
\seealso{
|
||||
\link{xgb.DataIter}, \link{xgb.ProxyDMatrix}, \link{xgb.ExternalDMatrix},
|
||||
\link{xgb.QuantileDMatrix}
|
||||
}
|
||||
Reference in New Issue
Block a user