Merge branch 'master' into sync-2024Jan24
This commit is contained in:
commit
44db1cef54
39
.github/workflows/i386.yml
vendored
Normal file
39
.github/workflows/i386.yml
vendored
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
name: XGBoost-i386-test
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read # to fetch code (actions/checkout)
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-32bit:
|
||||||
|
name: Build 32-bit
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
services:
|
||||||
|
registry:
|
||||||
|
image: registry:2
|
||||||
|
ports:
|
||||||
|
- 5000:5000
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2.5.0
|
||||||
|
with:
|
||||||
|
submodules: 'true'
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
with:
|
||||||
|
driver-opts: network=host
|
||||||
|
- name: Build and push container
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: tests/ci_build/Dockerfile.i386
|
||||||
|
push: true
|
||||||
|
tags: localhost:5000/xgboost/build-32bit:latest
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
- name: Build XGBoost
|
||||||
|
run: |
|
||||||
|
docker run --rm -v $PWD:/workspace -w /workspace \
|
||||||
|
-e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \
|
||||||
|
localhost:5000/xgboost/build-32bit:latest \
|
||||||
|
tests/ci_build/build_via_cmake.sh
|
||||||
@ -65,6 +65,6 @@ Imports:
|
|||||||
data.table (>= 1.9.6),
|
data.table (>= 1.9.6),
|
||||||
jsonlite (>= 1.0)
|
jsonlite (>= 1.0)
|
||||||
Roxygen: list(markdown = TRUE)
|
Roxygen: list(markdown = TRUE)
|
||||||
RoxygenNote: 7.3.0
|
RoxygenNote: 7.3.1
|
||||||
Encoding: UTF-8
|
Encoding: UTF-8
|
||||||
SystemRequirements: GNU make, C++17
|
SystemRequirements: GNU make, C++17
|
||||||
|
|||||||
@ -15,7 +15,6 @@ S3method(print,xgb.DMatrix)
|
|||||||
S3method(print,xgb.cv.synchronous)
|
S3method(print,xgb.cv.synchronous)
|
||||||
S3method(setinfo,xgb.Booster)
|
S3method(setinfo,xgb.Booster)
|
||||||
S3method(setinfo,xgb.DMatrix)
|
S3method(setinfo,xgb.DMatrix)
|
||||||
S3method(slice,xgb.DMatrix)
|
|
||||||
S3method(variable.names,xgb.Booster)
|
S3method(variable.names,xgb.Booster)
|
||||||
export("xgb.attr<-")
|
export("xgb.attr<-")
|
||||||
export("xgb.attributes<-")
|
export("xgb.attributes<-")
|
||||||
@ -30,10 +29,14 @@ export(cb.reset.parameters)
|
|||||||
export(cb.save.model)
|
export(cb.save.model)
|
||||||
export(getinfo)
|
export(getinfo)
|
||||||
export(setinfo)
|
export(setinfo)
|
||||||
export(slice)
|
|
||||||
export(xgb.DMatrix)
|
export(xgb.DMatrix)
|
||||||
export(xgb.DMatrix.hasinfo)
|
export(xgb.DMatrix.hasinfo)
|
||||||
export(xgb.DMatrix.save)
|
export(xgb.DMatrix.save)
|
||||||
|
export(xgb.DataBatch)
|
||||||
|
export(xgb.DataIter)
|
||||||
|
export(xgb.ExternalDMatrix)
|
||||||
|
export(xgb.QuantileDMatrix)
|
||||||
|
export(xgb.QuantileDMatrix.from_iterator)
|
||||||
export(xgb.attr)
|
export(xgb.attr)
|
||||||
export(xgb.attributes)
|
export(xgb.attributes)
|
||||||
export(xgb.config)
|
export(xgb.config)
|
||||||
@ -65,6 +68,7 @@ export(xgb.save)
|
|||||||
export(xgb.save.raw)
|
export(xgb.save.raw)
|
||||||
export(xgb.set.config)
|
export(xgb.set.config)
|
||||||
export(xgb.slice.Booster)
|
export(xgb.slice.Booster)
|
||||||
|
export(xgb.slice.DMatrix)
|
||||||
export(xgb.train)
|
export(xgb.train)
|
||||||
export(xgboost)
|
export(xgboost)
|
||||||
import(methods)
|
import(methods)
|
||||||
|
|||||||
@ -111,6 +111,21 @@ xgb.get.handle <- function(object) {
|
|||||||
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
|
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
|
||||||
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
|
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output
|
||||||
#' type and shape of predictions are invariant to the model type.
|
#' type and shape of predictions are invariant to the model type.
|
||||||
|
#' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names
|
||||||
|
#' match (only applicable when both `object` and `newdata` have feature names).
|
||||||
|
#'
|
||||||
|
#' If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
|
||||||
|
#' the columns in `newdata` to match with the booster's.
|
||||||
|
#'
|
||||||
|
#' If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
|
||||||
|
#' will additionally verify that categorical columns are of the correct type in `newdata`,
|
||||||
|
#' throwing an error if they do not match.
|
||||||
|
#'
|
||||||
|
#' If passing `FALSE`, it is assumed that the feature names and types are the same,
|
||||||
|
#' and come in the same order as in the training data.
|
||||||
|
#'
|
||||||
|
#' Note that this check might add some sizable latency to the predictions, so it's
|
||||||
|
#' recommended to disable it for performance-sensitive applications.
|
||||||
#' @param ... Not used.
|
#' @param ... Not used.
|
||||||
#'
|
#'
|
||||||
#' @details
|
#' @details
|
||||||
@ -271,7 +286,11 @@ xgb.get.handle <- function(object) {
|
|||||||
#' @export
|
#' @export
|
||||||
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
|
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE,
|
||||||
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
|
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
|
||||||
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
|
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE,
|
||||||
|
validate_features = FALSE, ...) {
|
||||||
|
if (validate_features) {
|
||||||
|
newdata <- validate.features(object, newdata)
|
||||||
|
}
|
||||||
if (!inherits(newdata, "xgb.DMatrix")) {
|
if (!inherits(newdata, "xgb.DMatrix")) {
|
||||||
nthread <- xgb.nthread(object)
|
nthread <- xgb.nthread(object)
|
||||||
newdata <- xgb.DMatrix(
|
newdata <- xgb.DMatrix(
|
||||||
@ -418,6 +437,85 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
|||||||
return(arr)
|
return(arr)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate.features <- function(bst, newdata) {
|
||||||
|
if (is.character(newdata)) {
|
||||||
|
# this will be encountered when passing file paths
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
if (inherits(newdata, "sparseVector")) {
|
||||||
|
# in this case, newdata won't have metadata
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
if (is.vector(newdata)) {
|
||||||
|
newdata <- as.matrix(newdata)
|
||||||
|
}
|
||||||
|
|
||||||
|
booster_names <- getinfo(bst, "feature_name")
|
||||||
|
checked_names <- FALSE
|
||||||
|
if (NROW(booster_names)) {
|
||||||
|
|
||||||
|
try_reorder <- FALSE
|
||||||
|
if (inherits(newdata, "xgb.DMatrix")) {
|
||||||
|
curr_names <- getinfo(newdata, "feature_name")
|
||||||
|
} else {
|
||||||
|
curr_names <- colnames(newdata)
|
||||||
|
try_reorder <- TRUE
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NROW(curr_names)) {
|
||||||
|
checked_names <- TRUE
|
||||||
|
|
||||||
|
if (length(curr_names) != length(booster_names) || any(curr_names != booster_names)) {
|
||||||
|
|
||||||
|
if (!try_reorder) {
|
||||||
|
stop("Feature names in 'newdata' do not match with booster's.")
|
||||||
|
} else {
|
||||||
|
if (inherits(newdata, "data.table")) {
|
||||||
|
newdata <- newdata[, booster_names, with = FALSE]
|
||||||
|
} else {
|
||||||
|
newdata <- newdata[, booster_names, drop = FALSE]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} # if (NROW(curr_names)) {
|
||||||
|
|
||||||
|
} # if (NROW(booster_names)) {
|
||||||
|
|
||||||
|
if (inherits(newdata, c("data.frame", "xgb.DMatrix"))) {
|
||||||
|
|
||||||
|
booster_types <- getinfo(bst, "feature_type")
|
||||||
|
if (!NROW(booster_types)) {
|
||||||
|
# Note: types in the booster are optional. Other interfaces
|
||||||
|
# might not even save it as booster attributes for example,
|
||||||
|
# even if the model uses categorical features.
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
if (inherits(newdata, "xgb.DMatrix")) {
|
||||||
|
curr_types <- getinfo(newdata, "feature_type")
|
||||||
|
if (length(curr_types) != length(booster_types) || any(curr_types != booster_types)) {
|
||||||
|
stop("Feature types in 'newdata' do not match with booster's.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (inherits(newdata, "data.frame")) {
|
||||||
|
is_factor <- sapply(newdata, is.factor)
|
||||||
|
if (any(is_factor != (booster_types == "c"))) {
|
||||||
|
stop(
|
||||||
|
paste0(
|
||||||
|
"Feature types in 'newdata' do not match with booster's for same columns (by ",
|
||||||
|
ifelse(checked_names, "name", "position"),
|
||||||
|
")."
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return(newdata)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#' @title Accessors for serializable attributes of a model
|
#' @title Accessors for serializable attributes of a model
|
||||||
#'
|
#'
|
||||||
|
|||||||
@ -1,14 +1,40 @@
|
|||||||
#' Construct xgb.DMatrix object
|
#' Construct xgb.DMatrix object
|
||||||
#'
|
#'
|
||||||
#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
|
#' Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
|
||||||
#' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
|
#' such as \link{xgb.train} or \link{predict.xgb.Booster}.
|
||||||
#' \code{\link{xgb.DMatrix.save}}).
|
|
||||||
#'
|
#'
|
||||||
#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
|
#' Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
|
||||||
#' a \code{dgRMatrix} object,
|
#' method already applied to it, which can be used to reduce memory usage (compared to using a
|
||||||
#' a \code{dsparseVector} object (only when making predictions from a fitted model, will be
|
#' a regular DMatrix first and then creating a quantization out of it) when using the histogram
|
||||||
#' interpreted as a row vector), or a character string representing a filename.
|
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
|
||||||
#' @param label Label of the training data.
|
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
|
||||||
|
#' (`tree_method = "approx"`).
|
||||||
|
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
|
||||||
|
#' for getting predictions out of a fitted model.
|
||||||
|
#'
|
||||||
|
#' Supported input types are as follows:\itemize{
|
||||||
|
#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
|
||||||
|
#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
|
||||||
|
#'
|
||||||
|
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
|
||||||
|
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
|
||||||
|
#' types is not kept as part of the model, so in subsequent calls to `predict`, it is the user's
|
||||||
|
#' responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
|
||||||
|
#' was constructed.
|
||||||
|
#'
|
||||||
|
#' Other column types are not supported.
|
||||||
|
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
|
||||||
|
#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are \bold{not} supported for
|
||||||
|
#' 'xgb.QuantileDMatrix'.
|
||||||
|
#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
|
||||||
|
#' as a single row (only when making predictions from a fitted model).
|
||||||
|
#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||||
|
#' supported for xgb.QuantileDMatrix'.
|
||||||
|
#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||||
|
#' \bold{not} supported for xgb.QuantileDMatrix'.
|
||||||
|
#' }
|
||||||
|
#' @param label Label of the training data. For classification problems, should be passed encoded as
|
||||||
|
#' integers with numeration starting at zero.
|
||||||
#' @param weight Weight for each instance.
|
#' @param weight Weight for each instance.
|
||||||
#'
|
#'
|
||||||
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
|
#' Note that, for ranking task, weights are per-group. In ranking task, one weight
|
||||||
@ -18,29 +44,45 @@
|
|||||||
#' @param base_margin Base margin used for boosting from existing model.
|
#' @param base_margin Base margin used for boosting from existing model.
|
||||||
#'
|
#'
|
||||||
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
#' In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||||
#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
|
#' @param missing A float value to represents missing values in data (not used when creating DMatrix
|
||||||
#' It is useful when a 0 or some other extreme value represents missing values in data.
|
#' from text files).
|
||||||
|
#' It is useful to change when a zero, infinite, or some other extreme value represents missing
|
||||||
|
#' values in data.
|
||||||
#' @param silent whether to suppress printing an informational message after loading from a file.
|
#' @param silent whether to suppress printing an informational message after loading from a file.
|
||||||
#' @param feature_names Set names for features. Overrides column names in data
|
#' @param feature_names Set names for features. Overrides column names in data
|
||||||
#' frame and matrix.
|
#' frame and matrix.
|
||||||
|
#'
|
||||||
|
#' Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||||
|
#' must be the same as in the DMatrix construction, regardless of the column names.
|
||||||
|
#' @param feature_types Set types for features.
|
||||||
|
#'
|
||||||
|
#' If `data` is a `data.frame` and passing `feature_types` is not supplied, feature types will be deduced
|
||||||
|
#' automatically from the column types.
|
||||||
|
#'
|
||||||
|
#' Otherwise, one can pass a character vector with the same length as number of columns in `data`,
|
||||||
|
#' with the following possible values:\itemize{
|
||||||
|
#' \item "c", which represents categorical columns.
|
||||||
|
#' \item "q", which represents numeric columns.
|
||||||
|
#' \item "int", which represents integer columns.
|
||||||
|
#' \item "i", which represents logical (boolean) columns.
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' Note that, while categorical types are treated differently from the rest for model fitting
|
||||||
|
#' purposes, the other types do not influence the generated model, but have effects in other
|
||||||
|
#' functionalities such as feature importances.
|
||||||
|
#'
|
||||||
|
#' \bold{Important}: categorical features, if specified manually through `feature_types`, must
|
||||||
|
#' be encoded as integers with numeration starting at zero, and the same encoding needs to be
|
||||||
|
#' applied when passing data to `predict`. Even if passing `factor` types, the encoding will
|
||||||
|
#' not be saved, so make sure that `factor` columns passed to `predict` have the same `levels`.
|
||||||
#' @param nthread Number of threads used for creating DMatrix.
|
#' @param nthread Number of threads used for creating DMatrix.
|
||||||
#' @param group Group size for all ranking group.
|
#' @param group Group size for all ranking group.
|
||||||
#' @param qid Query ID for data samples, used for ranking.
|
#' @param qid Query ID for data samples, used for ranking.
|
||||||
#' @param label_lower_bound Lower bound for survival training.
|
#' @param label_lower_bound Lower bound for survival training.
|
||||||
#' @param label_upper_bound Upper bound for survival training.
|
#' @param label_upper_bound Upper bound for survival training.
|
||||||
#' @param feature_weights Set feature weights for column sampling.
|
#' @param feature_weights Set feature weights for column sampling.
|
||||||
#' @param enable_categorical Experimental support of specializing for categorical features.
|
#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||||
#'
|
#' subclass 'xgb.QuantileDMatrix'.
|
||||||
#' If passing 'TRUE' and 'data' is a data frame,
|
|
||||||
#' columns of categorical types will automatically
|
|
||||||
#' be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
|
||||||
#'
|
|
||||||
#' If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
|
||||||
#' it will result in an error being thrown.
|
|
||||||
#'
|
|
||||||
#' If 'data' is not a data frame, this argument is ignored.
|
|
||||||
#'
|
|
||||||
#' JSON/UBJSON serialization format is required for this.
|
|
||||||
#'
|
#'
|
||||||
#' @details
|
#' @details
|
||||||
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
#' Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
||||||
@ -60,6 +102,7 @@
|
|||||||
#' xgb.DMatrix.save(dtrain, fname)
|
#' xgb.DMatrix.save(dtrain, fname)
|
||||||
#' dtrain <- xgb.DMatrix(fname)
|
#' dtrain <- xgb.DMatrix(fname)
|
||||||
#' @export
|
#' @export
|
||||||
|
#' @rdname xgb.DMatrix
|
||||||
xgb.DMatrix <- function(
|
xgb.DMatrix <- function(
|
||||||
data,
|
data,
|
||||||
label = NULL,
|
label = NULL,
|
||||||
@ -68,18 +111,18 @@ xgb.DMatrix <- function(
|
|||||||
missing = NA,
|
missing = NA,
|
||||||
silent = FALSE,
|
silent = FALSE,
|
||||||
feature_names = colnames(data),
|
feature_names = colnames(data),
|
||||||
|
feature_types = NULL,
|
||||||
nthread = NULL,
|
nthread = NULL,
|
||||||
group = NULL,
|
group = NULL,
|
||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL
|
||||||
enable_categorical = FALSE
|
|
||||||
) {
|
) {
|
||||||
if (!is.null(group) && !is.null(qid)) {
|
if (!is.null(group) && !is.null(qid)) {
|
||||||
stop("Either one of 'group' or 'qid' should be NULL")
|
stop("Either one of 'group' or 'qid' should be NULL")
|
||||||
}
|
}
|
||||||
ctypes <- NULL
|
nthread <- as.integer(NVL(nthread, -1L))
|
||||||
if (typeof(data) == "character") {
|
if (typeof(data) == "character") {
|
||||||
if (length(data) > 1) {
|
if (length(data) > 1) {
|
||||||
stop(
|
stop(
|
||||||
@ -91,7 +134,7 @@ xgb.DMatrix <- function(
|
|||||||
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
|
||||||
} else if (is.matrix(data)) {
|
} else if (is.matrix(data)) {
|
||||||
handle <- .Call(
|
handle <- .Call(
|
||||||
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
|
XGDMatrixCreateFromMat_R, data, missing, nthread
|
||||||
)
|
)
|
||||||
} else if (inherits(data, "dgCMatrix")) {
|
} else if (inherits(data, "dgCMatrix")) {
|
||||||
handle <- .Call(
|
handle <- .Call(
|
||||||
@ -101,7 +144,7 @@ xgb.DMatrix <- function(
|
|||||||
data@x,
|
data@x,
|
||||||
nrow(data),
|
nrow(data),
|
||||||
missing,
|
missing,
|
||||||
as.integer(NVL(nthread, -1))
|
nthread
|
||||||
)
|
)
|
||||||
} else if (inherits(data, "dgRMatrix")) {
|
} else if (inherits(data, "dgRMatrix")) {
|
||||||
handle <- .Call(
|
handle <- .Call(
|
||||||
@ -111,7 +154,7 @@ xgb.DMatrix <- function(
|
|||||||
data@x,
|
data@x,
|
||||||
ncol(data),
|
ncol(data),
|
||||||
missing,
|
missing,
|
||||||
as.integer(NVL(nthread, -1))
|
nthread
|
||||||
)
|
)
|
||||||
} else if (inherits(data, "dsparseVector")) {
|
} else if (inherits(data, "dsparseVector")) {
|
||||||
indptr <- c(0L, as.integer(length(data@i)))
|
indptr <- c(0L, as.integer(length(data@i)))
|
||||||
@ -123,41 +166,15 @@ xgb.DMatrix <- function(
|
|||||||
data@x,
|
data@x,
|
||||||
length(data),
|
length(data),
|
||||||
missing,
|
missing,
|
||||||
as.integer(NVL(nthread, -1))
|
nthread
|
||||||
)
|
)
|
||||||
} else if (is.data.frame(data)) {
|
} else if (is.data.frame(data)) {
|
||||||
ctypes <- sapply(data, function(x) {
|
tmp <- .process.df.for.dmatrix(data, feature_types)
|
||||||
if (is.factor(x)) {
|
feature_types <- tmp$feature_types
|
||||||
if (!enable_categorical) {
|
|
||||||
stop(
|
|
||||||
"When factor type is used, the parameter `enable_categorical`",
|
|
||||||
" must be set to TRUE."
|
|
||||||
)
|
|
||||||
}
|
|
||||||
"c"
|
|
||||||
} else if (is.integer(x)) {
|
|
||||||
"int"
|
|
||||||
} else if (is.logical(x)) {
|
|
||||||
"i"
|
|
||||||
} else {
|
|
||||||
if (!is.numeric(x)) {
|
|
||||||
stop("Invalid type in dataframe.")
|
|
||||||
}
|
|
||||||
"float"
|
|
||||||
}
|
|
||||||
})
|
|
||||||
## as.data.frame somehow converts integer/logical into real.
|
|
||||||
data <- as.data.frame(sapply(data, function(x) {
|
|
||||||
if (is.factor(x)) {
|
|
||||||
## XGBoost uses 0-based indexing.
|
|
||||||
as.numeric(x) - 1
|
|
||||||
} else {
|
|
||||||
x
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
handle <- .Call(
|
handle <- .Call(
|
||||||
XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
|
XGDMatrixCreateFromDF_R, tmp$lst, missing, nthread
|
||||||
)
|
)
|
||||||
|
rm(tmp)
|
||||||
} else {
|
} else {
|
||||||
stop("xgb.DMatrix does not support construction from ", typeof(data))
|
stop("xgb.DMatrix does not support construction from ", typeof(data))
|
||||||
}
|
}
|
||||||
@ -167,7 +184,75 @@ xgb.DMatrix <- function(
|
|||||||
class = "xgb.DMatrix",
|
class = "xgb.DMatrix",
|
||||||
fields = new.env()
|
fields = new.env()
|
||||||
)
|
)
|
||||||
|
.set.dmatrix.fields(
|
||||||
|
dmat = dmat,
|
||||||
|
label = label,
|
||||||
|
weight = weight,
|
||||||
|
base_margin = base_margin,
|
||||||
|
feature_names = feature_names,
|
||||||
|
feature_types = feature_types,
|
||||||
|
group = group,
|
||||||
|
qid = qid,
|
||||||
|
label_lower_bound = label_lower_bound,
|
||||||
|
label_upper_bound = label_upper_bound,
|
||||||
|
feature_weights = feature_weights
|
||||||
|
)
|
||||||
|
|
||||||
|
return(dmat)
|
||||||
|
}
|
||||||
|
|
||||||
|
.process.df.for.dmatrix <- function(df, feature_types) {
|
||||||
|
if (!nrow(df) || !ncol(df)) {
|
||||||
|
stop("'data' is an empty data.frame.")
|
||||||
|
}
|
||||||
|
if (!is.null(feature_types)) {
|
||||||
|
if (!is.character(feature_types) || length(feature_types) != ncol(df)) {
|
||||||
|
stop(
|
||||||
|
"'feature_types' must be a character vector with one entry per column in 'data'."
|
||||||
|
)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
feature_types <- sapply(df, function(col) {
|
||||||
|
if (is.factor(col)) {
|
||||||
|
return("c")
|
||||||
|
} else if (is.integer(col)) {
|
||||||
|
return("int")
|
||||||
|
} else if (is.logical(col)) {
|
||||||
|
return("i")
|
||||||
|
} else {
|
||||||
|
if (!is.numeric(col)) {
|
||||||
|
stop("Invalid type in dataframe.")
|
||||||
|
}
|
||||||
|
return("float")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
lst <- lapply(df, function(col) {
|
||||||
|
is_factor <- is.factor(col)
|
||||||
|
col <- as.numeric(col)
|
||||||
|
if (is_factor) {
|
||||||
|
col <- col - 1
|
||||||
|
}
|
||||||
|
return(col)
|
||||||
|
})
|
||||||
|
|
||||||
|
return(list(lst = lst, feature_types = feature_types))
|
||||||
|
}
|
||||||
|
|
||||||
|
.set.dmatrix.fields <- function(
|
||||||
|
dmat,
|
||||||
|
label,
|
||||||
|
weight,
|
||||||
|
base_margin,
|
||||||
|
feature_names,
|
||||||
|
feature_types,
|
||||||
|
group,
|
||||||
|
qid,
|
||||||
|
label_lower_bound,
|
||||||
|
label_upper_bound,
|
||||||
|
feature_weights
|
||||||
|
) {
|
||||||
if (!is.null(label)) {
|
if (!is.null(label)) {
|
||||||
setinfo(dmat, "label", label)
|
setinfo(dmat, "label", label)
|
||||||
}
|
}
|
||||||
@ -180,6 +265,9 @@ xgb.DMatrix <- function(
|
|||||||
if (!is.null(feature_names)) {
|
if (!is.null(feature_names)) {
|
||||||
setinfo(dmat, "feature_name", feature_names)
|
setinfo(dmat, "feature_name", feature_names)
|
||||||
}
|
}
|
||||||
|
if (!is.null(feature_types)) {
|
||||||
|
setinfo(dmat, "feature_type", feature_types)
|
||||||
|
}
|
||||||
if (!is.null(group)) {
|
if (!is.null(group)) {
|
||||||
setinfo(dmat, "group", group)
|
setinfo(dmat, "group", group)
|
||||||
}
|
}
|
||||||
@ -195,10 +283,511 @@ xgb.DMatrix <- function(
|
|||||||
if (!is.null(feature_weights)) {
|
if (!is.null(feature_weights)) {
|
||||||
setinfo(dmat, "feature_weights", feature_weights)
|
setinfo(dmat, "feature_weights", feature_weights)
|
||||||
}
|
}
|
||||||
if (!is.null(ctypes)) {
|
|
||||||
setinfo(dmat, "feature_type", ctypes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#' @param ref The training dataset that provides quantile information, needed when creating
|
||||||
|
#' validation/test dataset with `xgb.QuantileDMatrix`. Supplying the training DMatrix
|
||||||
|
#' as a reference means that the same quantisation applied to the training data is
|
||||||
|
#' applied to the validation/test data
|
||||||
|
#' @param max_bin The number of histogram bin, should be consistent with the training parameter
|
||||||
|
#' `max_bin`.
|
||||||
|
#'
|
||||||
|
#' This is only supported when constructing a QuantileDMatrix.
|
||||||
|
#' @export
|
||||||
|
#' @rdname xgb.DMatrix
|
||||||
|
xgb.QuantileDMatrix <- function(
|
||||||
|
data,
|
||||||
|
label = NULL,
|
||||||
|
weight = NULL,
|
||||||
|
base_margin = NULL,
|
||||||
|
missing = NA,
|
||||||
|
feature_names = colnames(data),
|
||||||
|
feature_types = NULL,
|
||||||
|
nthread = NULL,
|
||||||
|
group = NULL,
|
||||||
|
qid = NULL,
|
||||||
|
label_lower_bound = NULL,
|
||||||
|
label_upper_bound = NULL,
|
||||||
|
feature_weights = NULL,
|
||||||
|
ref = NULL,
|
||||||
|
max_bin = NULL
|
||||||
|
) {
|
||||||
|
nthread <- as.integer(NVL(nthread, -1L))
|
||||||
|
if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
|
||||||
|
stop("'ref' must be an xgb.DMatrix object.")
|
||||||
|
}
|
||||||
|
|
||||||
|
# Note: when passing an integer matrix, it won't get casted to numeric.
|
||||||
|
# Since 'int' values as understood by languages like C cannot have missing values,
|
||||||
|
# R represents missingness there by assigning them a value equal to the minimum
|
||||||
|
# integer. The 'missing' value here is set before the data, so in case of integers,
|
||||||
|
# need to make the conversion manually beforehand.
|
||||||
|
if (is.matrix(data) && storage.mode(data) %in% c("integer", "logical") && is.na(missing)) {
|
||||||
|
missing <- .Call(XGGetRNAIntAsDouble)
|
||||||
|
}
|
||||||
|
|
||||||
|
iterator_env <- as.environment(
|
||||||
|
list(
|
||||||
|
data = data,
|
||||||
|
label = label,
|
||||||
|
weight = weight,
|
||||||
|
base_margin = base_margin,
|
||||||
|
missing = missing,
|
||||||
|
feature_names = feature_names,
|
||||||
|
feature_types = feature_types,
|
||||||
|
group = group,
|
||||||
|
qid = qid,
|
||||||
|
label_lower_bound = label_lower_bound,
|
||||||
|
label_upper_bound = label_upper_bound,
|
||||||
|
feature_weights = feature_weights
|
||||||
|
)
|
||||||
|
)
|
||||||
|
data_iterator <- .single.data.iterator(iterator_env)
|
||||||
|
|
||||||
|
# Note: the ProxyDMatrix has its finalizer assigned in the R externalptr
|
||||||
|
# object, but that finalizer will only be called once the object is
|
||||||
|
# garbage-collected, which doesn't happen immediately after it goes out
|
||||||
|
# of scope, hence this piece of code to tigger its destruction earlier
|
||||||
|
# and free memory right away.
|
||||||
|
proxy_handle <- .make.proxy.handle()
|
||||||
|
on.exit({
|
||||||
|
.Call(XGDMatrixFree_R, proxy_handle)
|
||||||
|
})
|
||||||
|
iterator_next <- function() {
|
||||||
|
return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
|
||||||
|
}
|
||||||
|
iterator_reset <- function() {
|
||||||
|
return(data_iterator$f_reset(iterator_env))
|
||||||
|
}
|
||||||
|
calling_env <- environment()
|
||||||
|
|
||||||
|
dmat <- .Call(
|
||||||
|
XGQuantileDMatrixCreateFromCallback_R,
|
||||||
|
iterator_next,
|
||||||
|
iterator_reset,
|
||||||
|
calling_env,
|
||||||
|
proxy_handle,
|
||||||
|
nthread,
|
||||||
|
missing,
|
||||||
|
max_bin,
|
||||||
|
ref
|
||||||
|
)
|
||||||
|
attributes(dmat) <- list(
|
||||||
|
class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
|
||||||
|
fields = attributes(proxy_handle)$fields
|
||||||
|
)
|
||||||
|
return(dmat)
|
||||||
|
}
|
||||||
|
|
||||||
|
#' @title XGBoost Data Iterator
|
||||||
|
#' @description Interface to create a custom data iterator in order to construct a DMatrix
|
||||||
|
#' from external memory.
|
||||||
|
#'
|
||||||
|
#' This function is responsible for generating an R object structure containing callback
|
||||||
|
#' functions and an environment shared with them.
|
||||||
|
#'
|
||||||
|
#' The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
|
||||||
|
#' which will consume the data and create a DMatrix from it by executing the callback functions.
|
||||||
|
#'
|
||||||
|
#' For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||||
|
#' @param env An R environment to pass to the callback functions supplied here, which can be
|
||||||
|
#' used to keep track of variables to determine how to handle the batches.
|
||||||
|
#'
|
||||||
|
#' For example, one might want to keep track of an iteration number in this environment in order
|
||||||
|
#' to know which part of the data to pass next.
|
||||||
|
#' @param f_next `function(env)` which is responsible for:\itemize{
|
||||||
|
#' \item Accessing or retrieving the next batch of data in the iterator.
|
||||||
|
#' \item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
|
||||||
|
#' \item Keeping track of where in the iterator batch it is or will go next, which can for example
|
||||||
|
#' be done by modifying variables in the `env` variable that is passed here.
|
||||||
|
#' \item Signaling whether there are more batches to be consumed or not, by returning `NULL`
|
||||||
|
#' when the stream of data ends (all batches in the iterator have been consumed), or the result from
|
||||||
|
#' calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
|
||||||
|
#' }
|
||||||
|
#' @param f_reset `function(env)` which is responsible for resetting the data iterator
|
||||||
|
#' (i.e. taking it back to the first batch, called before and after the sequence of batches
|
||||||
|
#' has been consumed).
|
||||||
|
#'
|
||||||
|
#' Note that, after resetting the iterator, the batches will be accessed again, so the same data
|
||||||
|
#' (and in the same order) must be passed in subsequent iterations.
|
||||||
|
#' @return An `xgb.DataIter` object, containing the same inputs supplied here, which can then
|
||||||
|
#' be passed to \link{xgb.ExternalDMatrix}.
|
||||||
|
#' @seealso \link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
|
||||||
|
#' @export
|
||||||
|
xgb.DataIter <- function(env = new.env(), f_next, f_reset) {
  # Fail fast when either callback is not an actual function.
  if (!is.function(f_next)) {
    stop("'f_next' must be a function.")
  }
  if (!is.function(f_reset)) {
    stop("'f_reset' must be a function.")
  }
  # Bundle the shared environment and the two callbacks into a classed list.
  # Consumers access them as $env / $f_next / $f_reset.
  structure(
    list(env = env, f_next = f_next, f_reset = f_reset),
    class = "xgb.DataIter"
  )
}
|
||||||
|
|
||||||
|
.qdm.single.fnext <- function(env) {
  iter_num <- env[["iter"]]
  # This iterator serves exactly one batch: any call after the first
  # signals end-of-stream by returning NULL.
  if (iter_num >= 1L) {
    return(NULL)
  }
  # Advance the counter on exit, so the iterator moves forward even if
  # constructing the batch below throws an error.
  on.exit(env[["iter"]] <- iter_num + 1L)
  # Hand the full dataset over as a single xgb.DataBatch.
  xgb.DataBatch(
    data = env[["data"]],
    label = env[["label"]],
    weight = env[["weight"]],
    base_margin = env[["base_margin"]],
    feature_names = env[["feature_names"]],
    feature_types = env[["feature_types"]],
    group = env[["group"]],
    qid = env[["qid"]],
    label_lower_bound = env[["label_lower_bound"]],
    label_upper_bound = env[["label_upper_bound"]],
    feature_weights = env[["feature_weights"]]
  )
}
|
||||||
|
|
||||||
|
.qdm.single.freset <- function(env) {
  # Rewind the single-batch iterator back to its starting position.
  env[["iter"]] <- 0L
  invisible(NULL)
}
|
||||||
|
|
||||||
|
.single.data.iterator <- function(env) {
  # Initialize the iteration counter, then wrap the one-shot
  # next/reset callbacks into a standard data iterator.
  env[["iter"]] <- 0L
  xgb.DataIter(env, .qdm.single.fnext, .qdm.single.freset)
}
|
||||||
|
|
||||||
|
# Only for internal usage
|
||||||
|
.make.proxy.handle <- function() {
  # Create a fresh proxy DMatrix handle at the C level.
  handle <- .Call(XGProxyDMatrixCreate_R)
  # Replace all attributes at once: class tags plus an environment used
  # as a mutable registry for DMatrix fields.
  attributes(handle) <- list(
    class = c("xgb.DMatrix", "xgb.ProxyDMatrix"),
    fields = new.env()
  )
  handle
}
|
||||||
|
|
||||||
|
#' @title Structure for Data Batches
|
||||||
|
#' @description Helper function to supply data in batches of a data iterator when
|
||||||
|
#' constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
|
||||||
|
#' or through \link{xgb.QuantileDMatrix.from_iterator}.
|
||||||
|
#'
|
||||||
|
#' This function is \bold{only} meant to be called inside of a callback function (which
|
||||||
|
#' is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
|
||||||
|
#' when constructing a DMatrix through external memory - otherwise, one should call
|
||||||
|
#' \link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
|
||||||
|
#'
|
||||||
|
#' The object that results from calling this function directly is \bold{not} like
|
||||||
|
#' an `xgb.DMatrix` - i.e. cannot be used to train a model, nor to get predictions - only
|
||||||
|
#' possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
|
||||||
|
#'
|
||||||
|
#' For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||||
|
#' @inheritParams xgb.DMatrix
|
||||||
|
#' @param data Batch of data belonging to this batch.
|
||||||
|
#'
|
||||||
|
#' Note that not all of the input types supported by \link{xgb.DMatrix} are possible
|
||||||
|
#' to pass here. Supported types are:\itemize{
|
||||||
|
#' \item `matrix`, with types `numeric`, `integer`, and `logical`. Note that for types
|
||||||
|
#' `integer` and `logical`, missing values might not be automatically recognized as
|
||||||
|
#' such - see the documentation for parameter `missing` in \link{xgb.ExternalDMatrix}
|
||||||
|
#' for details on this.
|
||||||
|
#' \item `data.frame`, with the same types as supported by 'xgb.DMatrix' and same
|
||||||
|
#' conversions applied to it. See the documentation for parameter `data` in
|
||||||
|
#' \link{xgb.DMatrix} for details on it.
|
||||||
|
#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
|
||||||
|
#' }
|
||||||
|
#' @return An object of class `xgb.DataBatch`, which is just a list containing the
|
||||||
|
#' data and parameters passed here. It does \bold{not} inherit from `xgb.DMatrix`.
|
||||||
|
#' @seealso \link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
|
||||||
|
#' @export
|
||||||
|
xgb.DataBatch <- function(
  data,
  label = NULL,
  weight = NULL,
  base_margin = NULL,
  feature_names = colnames(data),
  feature_types = NULL,
  group = NULL,
  qid = NULL,
  label_lower_bound = NULL,
  label_upper_bound = NULL,
  feature_weights = NULL
) {
  # Only the container types that the proxy DMatrix can ingest are accepted.
  stopifnot(inherits(data, c("matrix", "data.frame", "dgRMatrix")))
  # Plain classed list: this is a data carrier consumed by the iterator
  # machinery, not an xgb.DMatrix.
  structure(
    list(
      data = data,
      label = label,
      weight = weight,
      base_margin = base_margin,
      feature_names = feature_names,
      feature_types = feature_types,
      group = group,
      qid = qid,
      label_lower_bound = label_lower_bound,
      label_upper_bound = label_upper_bound,
      feature_weights = feature_weights
    ),
    class = "xgb.DataBatch"
  )
}
|
||||||
|
|
||||||
|
# This is only for internal usage, class is not exposed to the user.
# Pulls the next batch from 'data_iterator' and loads it into the proxy
# DMatrix handle. Returns 1L when a batch was loaded, or 0L when the
# iterator is exhausted (presumably the integer codes the C-side callback
# machinery expects - confirm against the C sources).
xgb.ProxyDMatrix <- function(proxy_handle, data_iterator) {
  lst <- data_iterator$f_next(data_iterator$env)
  if (is.null(lst)) {
    # NULL from 'f_next' means the stream of batches has ended.
    return(0L)
  }
  if (!inherits(lst, "xgb.DataBatch")) {
    stop("DataIter 'f_next' must return either NULL or the result from calling 'xgb.DataBatch'.")
  }

  # 'group' and 'qid' are mutually exclusive ways to pass the same
  # ranking-group information.
  if (!is.null(lst$group) && !is.null(lst$qid)) {
    stop("Either one of 'group' or 'qid' should be NULL")
  }
  # Dispatch on the batch's data container type to the matching C setter.
  if (is.data.frame(lst$data)) {
    # Data frames are decomposed into columns (which may also deduce
    # feature types) before being handed to the C function.
    tmp <- .process.df.for.dmatrix(lst$data, lst$feature_types)
    lst$feature_types <- tmp$feature_types
    .Call(XGProxyDMatrixSetDataColumnar_R, proxy_handle, tmp$lst)
    rm(tmp)
  } else if (is.matrix(lst$data)) {
    .Call(XGProxyDMatrixSetDataDense_R, proxy_handle, lst$data)
  } else if (inherits(lst$data, "dgRMatrix")) {
    # CSR components are passed down as a plain list of the matrix slots.
    tmp <- list(p = lst$data@p, j = lst$data@j, x = lst$data@x, ncol = ncol(lst$data))
    .Call(XGProxyDMatrixSetDataCSR_R, proxy_handle, tmp)
  } else {
    stop("'data' has unsupported type.")
  }

  # Attach the remaining per-batch fields (labels, weights, bounds, etc.)
  # to the proxy handle.
  .set.dmatrix.fields(
    dmat = proxy_handle,
    label = lst$label,
    weight = lst$weight,
    base_margin = lst$base_margin,
    feature_names = lst$feature_names,
    feature_types = lst$feature_types,
    group = lst$group,
    qid = lst$qid,
    label_lower_bound = lst$label_lower_bound,
    label_upper_bound = lst$label_upper_bound,
    feature_weights = lst$feature_weights
  )

  # Signal to the caller that a batch was produced.
  return(1L)
}
|
||||||
|
|
||||||
|
#' @title DMatrix from External Data
|
||||||
|
#' @description Create a special type of xgboost 'DMatrix' object from external data
|
||||||
|
#' supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
|
||||||
|
#' bigger set that might not fit entirely in memory.
|
||||||
|
#'
|
||||||
|
#' The data supplied by the iterator is accessed on-demand as needed, multiple times,
|
||||||
|
#' without being concatenated, but note that fields like 'label' \bold{will} be
|
||||||
|
#' concatenated from multiple calls to the data iterator.
|
||||||
|
#'
|
||||||
|
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
||||||
|
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||||
|
#' @inheritParams xgb.DMatrix
|
||||||
|
#' @param data_iterator A data iterator structure as returned by \link{xgb.DataIter},
|
||||||
|
#' which includes an environment shared between function calls, and functions to access
|
||||||
|
#' the data in batches on-demand.
|
||||||
|
#' @param cache_prefix The path of cache file, caller must initialize all the directories in this path.
|
||||||
|
#' @param missing A float value that represents missing values in data.
|
||||||
|
#'
|
||||||
|
#' Note that, while functions like \link{xgb.DMatrix} can take a generic `NA` and interpret it
|
||||||
|
#' correctly for different types like `numeric` and `integer`, if an `NA` value is passed here,
|
||||||
|
#' it will not be adapted for different input types.
|
||||||
|
#'
|
||||||
|
#' For example, in R `integer` types, missing values are represented by integer number `-2147483648`
|
||||||
|
#' (since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes `NA`,
|
||||||
|
#' which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||||
|
#' 'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||||
|
#' This should not pose any problem for `numeric` types, since they do have an inherent NaN value.
|
||||||
|
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
|
||||||
|
#' held internally but accessed through the iterator when needed.
|
||||||
|
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
|
||||||
|
#' @examples
|
||||||
|
#' library(xgboost)
|
||||||
|
#' data(mtcars)
|
||||||
|
#'
|
||||||
|
#' # this custom environment will be passed to the iterator
|
||||||
|
#' # functions at each call. It's up to the user to keep
|
||||||
|
#' # track of the iteration number in this environment.
|
||||||
|
#' iterator_env <- as.environment(
|
||||||
|
#' list(
|
||||||
|
#' iter = 0,
|
||||||
|
#' x = mtcars[, -1],
|
||||||
|
#' y = mtcars[, 1]
|
||||||
|
#' )
|
||||||
|
#' )
|
||||||
|
#'
|
||||||
|
#' # Data is passed in two batches.
|
||||||
|
#' # In this example, batches are obtained by subsetting the 'x' variable.
|
||||||
|
#' # This is not advantageous to do, since the data is already loaded in memory
|
||||||
|
#' # and can be passed in full in one go, but there can be situations in which
|
||||||
|
#' # only a subset of the data will fit in the computer's memory, and it can
|
||||||
|
#' # be loaded in batches that are accessed one-at-a-time only.
|
||||||
|
#' iterator_next <- function(iterator_env) {
|
||||||
|
#' curr_iter <- iterator_env[["iter"]]
|
||||||
|
#' if (curr_iter >= 2) {
|
||||||
|
#' # there are only two batches, so this signals end of the stream
|
||||||
|
#' return(NULL)
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' if (curr_iter == 0) {
|
||||||
|
#' x_batch <- iterator_env[["x"]][1:16, ]
|
||||||
|
#' y_batch <- iterator_env[["y"]][1:16]
|
||||||
|
#' } else {
|
||||||
|
#' x_batch <- iterator_env[["x"]][17:32, ]
|
||||||
|
#' y_batch <- iterator_env[["y"]][17:32]
|
||||||
|
#' }
|
||||||
|
#' on.exit({
|
||||||
|
#' iterator_env[["iter"]] <- curr_iter + 1
|
||||||
|
#' })
|
||||||
|
#'
|
||||||
|
#' # Function 'xgb.DataBatch' must be called manually
|
||||||
|
#' # at each batch with all the appropriate attributes,
|
||||||
|
#' # such as feature names and feature types.
|
||||||
|
#' return(xgb.DataBatch(data = x_batch, label = y_batch))
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' # This moves the iterator back to its beginning
|
||||||
|
#' iterator_reset <- function(iterator_env) {
|
||||||
|
#' iterator_env[["iter"]] <- 0
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' data_iterator <- xgb.DataIter(
|
||||||
|
#' env = iterator_env,
|
||||||
|
#' f_next = iterator_next,
|
||||||
|
#' f_reset = iterator_reset
|
||||||
|
#' )
|
||||||
|
#' cache_prefix <- tempdir()
|
||||||
|
#'
|
||||||
|
#' # DMatrix will be constructed from the iterator's batches
|
||||||
|
#' dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
|
||||||
|
#'
|
||||||
|
#' # After construction, can be used as a regular DMatrix
|
||||||
|
#' params <- list(nthread = 1, objective = "reg:squarederror")
|
||||||
|
#' model <- xgb.train(data = dm, nrounds = 2, params = params)
|
||||||
|
#'
|
||||||
|
#' # Predictions can also be called on it, and should be the same
|
||||||
|
#' # as if the data were passed differently.
|
||||||
|
#' pred_dm <- predict(model, dm)
|
||||||
|
#' pred_mat <- predict(model, as.matrix(mtcars[, -1]))
|
||||||
|
#' @export
|
||||||
|
xgb.ExternalDMatrix <- function(
  data_iterator,
  cache_prefix = tempdir(),
  missing = NA,
  nthread = NULL
) {
  # Validate the two user-facing inputs before touching the C API.
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  stopifnot(is.character(cache_prefix))

  cache_prefix <- path.expand(cache_prefix)
  # NVL substitutes -1L when 'nthread' is NULL (presumably -1 means
  # "use default number of threads" on the C side - confirm).
  nthread <- as.integer(NVL(nthread, -1L))

  # The proxy DMatrix is a temporary staging handle for each batch;
  # free it eagerly on exit rather than waiting for garbage collection.
  proxy_handle <- .make.proxy.handle()
  on.exit({
    .Call(XGDMatrixFree_R, proxy_handle)
  })
  # Thin zero-argument closures over the iterator, in the shape the
  # C callback entry point invokes.
  iterator_next <- function() {
    return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
  }
  iterator_reset <- function() {
    return(data_iterator$f_reset(data_iterator$env))
  }
  calling_env <- environment()

  # Construct the external-memory DMatrix; batches are pulled on demand
  # through 'iterator_next' / 'iterator_reset', with on-disk caching
  # rooted at 'cache_prefix'.
  dmat <- .Call(
    XGDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    cache_prefix
  )

  # Tag with the external subclass and share the proxy's fields registry.
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.ExternalDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}
|
||||||
|
|
||||||
|
|
||||||
|
#' @title QuantileDMatrix from External Data
|
||||||
|
#' @description Create an `xgb.QuantileDMatrix` object (exact same class as would be returned by
|
||||||
|
#' calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
|
||||||
|
#' external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
|
||||||
|
#' a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
|
||||||
|
#'
|
||||||
|
#' Note that, while external data will only be loaded through the iterator (thus the full data
|
||||||
|
#' might not be held entirely in-memory), the quantized representation of the data will get
|
||||||
|
#' created in-memory, being concatenated from multiple calls to the data iterator. The quantized
|
||||||
|
#' version is typically lighter than the original data, so there might be cases in which this
|
||||||
|
#' representation could potentially fit in memory even if the full data doesn't.
|
||||||
|
#'
|
||||||
|
#' For more information, see the guide 'Using XGBoost External Memory Version':
|
||||||
|
#' \url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||||
|
#' @inheritParams xgb.ExternalDMatrix
|
||||||
|
#' @inheritParams xgb.QuantileDMatrix
|
||||||
|
#' @return An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
|
||||||
|
#' @seealso \link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
|
||||||
|
#' \link{xgb.QuantileDMatrix}
|
||||||
|
#' @export
|
||||||
|
xgb.QuantileDMatrix.from_iterator <- function( # nolint
  data_iterator,
  missing = NA,
  nthread = NULL,
  ref = NULL,
  max_bin = NULL
) {
  # Fix: the rendered source ended with a duplicated 'return(dmat)' and an
  # extra closing brace (merge artifact), which made the function invalid.
  # Validate inputs before touching the C API.
  stopifnot(inherits(data_iterator, "xgb.DataIter"))
  if (!is.null(ref) && !inherits(ref, "xgb.DMatrix")) {
    stop("'ref' must be an xgb.DMatrix object.")
  }

  # NVL substitutes -1L when 'nthread' is NULL (presumably -1 means
  # "use default number of threads" on the C side - confirm).
  nthread <- as.integer(NVL(nthread, -1L))

  # The proxy DMatrix is a temporary staging handle for each batch;
  # free it eagerly on exit rather than waiting for garbage collection.
  proxy_handle <- .make.proxy.handle()
  on.exit({
    .Call(XGDMatrixFree_R, proxy_handle)
  })
  # Thin zero-argument closures over the iterator, in the shape the
  # C callback entry point invokes.
  iterator_next <- function() {
    return(xgb.ProxyDMatrix(proxy_handle, data_iterator))
  }
  iterator_reset <- function() {
    return(data_iterator$f_reset(data_iterator$env))
  }
  calling_env <- environment()

  # Build the quantized DMatrix, pulling batches on demand and using
  # 'ref' (when given) as the source of quantile cut points.
  dmat <- .Call(
    XGQuantileDMatrixCreateFromCallback_R,
    iterator_next,
    iterator_reset,
    calling_env,
    proxy_handle,
    nthread,
    missing,
    max_bin,
    ref
  )

  # Tag with the quantile subclass and share the proxy's fields registry.
  attributes(dmat) <- list(
    class = c("xgb.DMatrix", "xgb.QuantileDMatrix"),
    fields = attributes(proxy_handle)$fields
  )
  return(dmat)
}
|
||||||
|
|
||||||
@ -646,19 +1235,15 @@ xgb.get.DMatrix.data <- function(dmat) {
|
|||||||
#' data(agaricus.train, package='xgboost')
|
#' data(agaricus.train, package='xgboost')
|
||||||
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
||||||
#'
|
#'
|
||||||
#' dsub <- slice(dtrain, 1:42)
|
#' dsub <- xgb.slice.DMatrix(dtrain, 1:42)
|
||||||
#' labels1 <- getinfo(dsub, 'label')
|
#' labels1 <- getinfo(dsub, 'label')
|
||||||
#' dsub <- dtrain[1:42, ]
|
#' dsub <- dtrain[1:42, ]
|
||||||
#' labels2 <- getinfo(dsub, 'label')
|
#' labels2 <- getinfo(dsub, 'label')
|
||||||
#' all.equal(labels1, labels2)
|
#' all.equal(labels1, labels2)
|
||||||
#'
|
#'
|
||||||
#' @rdname slice.xgb.DMatrix
|
#' @rdname xgb.slice.DMatrix
|
||||||
#' @export
|
#' @export
|
||||||
slice <- function(object, idxset) UseMethod("slice")
|
xgb.slice.DMatrix <- function(object, idxset) {
|
||||||
|
|
||||||
#' @rdname slice.xgb.DMatrix
|
|
||||||
#' @export
|
|
||||||
slice.xgb.DMatrix <- function(object, idxset) {
|
|
||||||
if (!inherits(object, "xgb.DMatrix")) {
|
if (!inherits(object, "xgb.DMatrix")) {
|
||||||
stop("object must be xgb.DMatrix")
|
stop("object must be xgb.DMatrix")
|
||||||
}
|
}
|
||||||
@ -682,10 +1267,10 @@ slice.xgb.DMatrix <- function(object, idxset) {
|
|||||||
return(structure(ret, class = "xgb.DMatrix"))
|
return(structure(ret, class = "xgb.DMatrix"))
|
||||||
}
|
}
|
||||||
|
|
||||||
#' @rdname slice.xgb.DMatrix
|
#' @rdname xgb.slice.DMatrix
|
||||||
#' @export
|
#' @export
|
||||||
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
  # Fix: collapse duplicated old/new diff lines into the renamed
  # (current) implementation. 'colset' is accepted for compatibility
  # with the generic `[` signature but is ignored: only row subsetting
  # is supported.
  xgb.slice.DMatrix(object, idxset)
}
|
||||||
|
|
||||||
|
|
||||||
@ -712,7 +1297,17 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
|
|||||||
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
|
cat("INVALID xgb.DMatrix object. Must be constructed anew.\n")
|
||||||
return(invisible(x))
|
return(invisible(x))
|
||||||
}
|
}
|
||||||
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
|
class_print <- if (inherits(x, "xgb.QuantileDMatrix")) {
|
||||||
|
"xgb.QuantileDMatrix"
|
||||||
|
} else if (inherits(x, "xgb.ExternalDMatrix")) {
|
||||||
|
"xgb.ExternalDMatrix"
|
||||||
|
} else if (inherits(x, "xgb.ProxyDMatrix")) {
|
||||||
|
"xgb.ProxyDMatrix"
|
||||||
|
} else {
|
||||||
|
"xgb.DMatrix"
|
||||||
|
}
|
||||||
|
|
||||||
|
cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
|
||||||
infos <- character(0)
|
infos <- character(0)
|
||||||
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
|
if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
|
||||||
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
|
if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
|
||||||
|
|||||||
@ -197,12 +197,12 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
|
|||||||
nthread = params$nthread
|
nthread = params$nthread
|
||||||
)
|
)
|
||||||
bst_folds <- lapply(seq_along(folds), function(k) {
|
bst_folds <- lapply(seq_along(folds), function(k) {
|
||||||
dtest <- slice(dall, folds[[k]])
|
dtest <- xgb.slice.DMatrix(dall, folds[[k]])
|
||||||
# code originally contributed by @RolandASc on stackoverflow
|
# code originally contributed by @RolandASc on stackoverflow
|
||||||
if (is.null(train_folds))
|
if (is.null(train_folds))
|
||||||
dtrain <- slice(dall, unlist(folds[-k]))
|
dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]))
|
||||||
else
|
else
|
||||||
dtrain <- slice(dall, train_folds[[k]])
|
dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]])
|
||||||
bst <- xgb.Booster(
|
bst <- xgb.Booster(
|
||||||
params = params,
|
params = params,
|
||||||
cachelist = list(dtrain, dtest),
|
cachelist = list(dtrain, dtest),
|
||||||
|
|||||||
@ -17,6 +17,7 @@
|
|||||||
training = FALSE,
|
training = FALSE,
|
||||||
iterationrange = NULL,
|
iterationrange = NULL,
|
||||||
strict_shape = FALSE,
|
strict_shape = FALSE,
|
||||||
|
validate_features = FALSE,
|
||||||
...
|
...
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -66,6 +67,23 @@ base-1 indexing, and inclusive of both ends).
|
|||||||
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
|
\item{strict_shape}{Default is \code{FALSE}. When set to \code{TRUE}, the output
|
||||||
type and shape of predictions are invariant to the model type.}
|
type and shape of predictions are invariant to the model type.}
|
||||||
|
|
||||||
|
\item{validate_features}{When \code{TRUE}, validate that the Booster's and newdata's feature_names
|
||||||
|
match (only applicable when both \code{object} and \code{newdata} have feature names).
|
||||||
|
|
||||||
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If the column names differ and `newdata` is not an `xgb.DMatrix`, will try to reorder
|
||||||
|
the columns in `newdata` to match with the booster's.
|
||||||
|
|
||||||
|
If the booster has feature types and `newdata` is either an `xgb.DMatrix` or `data.frame`,
|
||||||
|
will additionally verify that categorical columns are of the correct type in `newdata`,
|
||||||
|
throwing an error if they do not match.
|
||||||
|
|
||||||
|
If passing `FALSE`, it is assumed that the feature names and types are the same,
|
||||||
|
and come in the same order as in the training data.
|
||||||
|
|
||||||
|
Note that this check might add some sizable latency to the predictions, so it's
|
||||||
|
recommended to disable it for performance-sensitive applications.
|
||||||
|
}\if{html}{\out{</div>}}}
|
||||||
|
|
||||||
\item{...}{Not used.}
|
\item{...}{Not used.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
% Please edit documentation in R/xgb.DMatrix.R
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
\name{xgb.DMatrix}
|
\name{xgb.DMatrix}
|
||||||
\alias{xgb.DMatrix}
|
\alias{xgb.DMatrix}
|
||||||
|
\alias{xgb.QuantileDMatrix}
|
||||||
\title{Construct xgb.DMatrix object}
|
\title{Construct xgb.DMatrix object}
|
||||||
\usage{
|
\usage{
|
||||||
xgb.DMatrix(
|
xgb.DMatrix(
|
||||||
@ -12,22 +13,61 @@ xgb.DMatrix(
|
|||||||
missing = NA,
|
missing = NA,
|
||||||
silent = FALSE,
|
silent = FALSE,
|
||||||
feature_names = colnames(data),
|
feature_names = colnames(data),
|
||||||
|
feature_types = NULL,
|
||||||
|
nthread = NULL,
|
||||||
|
group = NULL,
|
||||||
|
qid = NULL,
|
||||||
|
label_lower_bound = NULL,
|
||||||
|
label_upper_bound = NULL,
|
||||||
|
feature_weights = NULL
|
||||||
|
)
|
||||||
|
|
||||||
|
xgb.QuantileDMatrix(
|
||||||
|
data,
|
||||||
|
label = NULL,
|
||||||
|
weight = NULL,
|
||||||
|
base_margin = NULL,
|
||||||
|
missing = NA,
|
||||||
|
feature_names = colnames(data),
|
||||||
|
feature_types = NULL,
|
||||||
nthread = NULL,
|
nthread = NULL,
|
||||||
group = NULL,
|
group = NULL,
|
||||||
qid = NULL,
|
qid = NULL,
|
||||||
label_lower_bound = NULL,
|
label_lower_bound = NULL,
|
||||||
label_upper_bound = NULL,
|
label_upper_bound = NULL,
|
||||||
feature_weights = NULL,
|
feature_weights = NULL,
|
||||||
enable_categorical = FALSE
|
ref = NULL,
|
||||||
|
max_bin = NULL
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
|
\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or
|
||||||
a \code{dgRMatrix} object,
|
for getting predictions out of a fitted model.
|
||||||
a \code{dsparseVector} object (only when making predictions from a fitted model, will be
|
|
||||||
interpreted as a row vector), or a character string representing a filename.}
|
|
||||||
|
|
||||||
\item{label}{Label of the training data.}
|
Supported input types are as follows:\itemize{
|
||||||
|
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
|
||||||
|
\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
|
||||||
|
|
||||||
|
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
|
||||||
|
encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor}
|
||||||
|
types is not kept as part of the model, so in subsequent calls to \code{predict}, it is the user's
|
||||||
|
responsibility to ensure that factor columns have the same levels as the ones from which the DMatrix
|
||||||
|
was constructed.
|
||||||
|
|
||||||
|
Other column types are not supported.
|
||||||
|
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
|
||||||
|
\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \bold{not} supported for
|
||||||
|
'xgb.QuantileDMatrix'.
|
||||||
|
\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
|
||||||
|
as a single row (only when making predictions from a fitted model).
|
||||||
|
\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not}
|
||||||
|
supported for xgb.QuantileDMatrix'.
|
||||||
|
\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are
|
||||||
|
\bold{not} supported for xgb.QuantileDMatrix'.
|
||||||
|
}}
|
||||||
|
|
||||||
|
\item{label}{Label of the training data. For classification problems, should be passed encoded as
|
||||||
|
integers with numeration starting at zero.}
|
||||||
|
|
||||||
\item{weight}{Weight for each instance.
|
\item{weight}{Weight for each instance.
|
||||||
|
|
||||||
@ -41,13 +81,41 @@ so it doesn't make sense to assign weights to individual data points.}
|
|||||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||||
}\if{html}{\out{</div>}}}
|
}\if{html}{\out{</div>}}}
|
||||||
|
|
||||||
\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
|
\item{missing}{A float value to represents missing values in data (not used when creating DMatrix
|
||||||
It is useful when a 0 or some other extreme value represents missing values in data.}
|
from text files).
|
||||||
|
It is useful to change when a zero, infinite, or some other extreme value represents missing
|
||||||
|
values in data.}
|
||||||
|
|
||||||
\item{silent}{whether to suppress printing an informational message after loading from a file.}
|
\item{silent}{whether to suppress printing an informational message after loading from a file.}
|
||||||
|
|
||||||
\item{feature_names}{Set names for features. Overrides column names in data
|
\item{feature_names}{Set names for features. Overrides column names in data
|
||||||
frame and matrix.}
|
frame and matrix.
|
||||||
|
|
||||||
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||||
|
must be the same as in the DMatrix construction, regardless of the column names.
|
||||||
|
}\if{html}{\out{</div>}}}
|
||||||
|
|
||||||
|
\item{feature_types}{Set types for features.
|
||||||
|
|
||||||
|
If \code{data} is a \code{data.frame} and passing \code{feature_types} is not supplied, feature types will be deduced
|
||||||
|
automatically from the column types.
|
||||||
|
|
||||||
|
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||||
|
with the following possible values:\itemize{
|
||||||
|
\item "c", which represents categorical columns.
|
||||||
|
\item "q", which represents numeric columns.
|
||||||
|
\item "int", which represents integer columns.
|
||||||
|
\item "i", which represents logical (boolean) columns.
|
||||||
|
}
|
||||||
|
|
||||||
|
Note that, while categorical types are treated differently from the rest for model fitting
|
||||||
|
purposes, the other types do not influence the generated model, but have effects in other
|
||||||
|
functionalities such as feature importances.
|
||||||
|
|
||||||
|
\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
|
||||||
|
be encoded as integers with numeration starting at zero, and the same encoding needs to be
|
||||||
|
applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
|
||||||
|
not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}
|
||||||
|
|
||||||
\item{nthread}{Number of threads used for creating DMatrix.}
|
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||||
|
|
||||||
@ -61,26 +129,32 @@ frame and matrix.}
|
|||||||
|
|
||||||
\item{feature_weights}{Set feature weights for column sampling.}
|
\item{feature_weights}{Set feature weights for column sampling.}
|
||||||
|
|
||||||
\item{enable_categorical}{Experimental support of specializing for categorical features.
|
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||||
|
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||||
|
as a reference means that the same quantisation applied to the training data is
|
||||||
|
applied to the validation/test data}
|
||||||
|
|
||||||
\if{html}{\out{<div class="sourceCode">}}\preformatted{ If passing 'TRUE' and 'data' is a data frame,
|
\item{max_bin}{The number of histogram bin, should be consistent with the training parameter
|
||||||
columns of categorical types will automatically
|
\code{max_bin}.
|
||||||
be set to be of categorical type (feature_type='c') in the resulting DMatrix.
|
|
||||||
|
|
||||||
If passing 'FALSE' and 'data' is a data frame with categorical columns,
|
This is only supported when constructing a QuantileDMatrix.}
|
||||||
it will result in an error being thrown.
|
}
|
||||||
|
\value{
|
||||||
If 'data' is not a data frame, this argument is ignored.
|
An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
|
||||||
|
subclass 'xgb.QuantileDMatrix'.
|
||||||
JSON/UBJSON serialization format is required for this.
|
|
||||||
}\if{html}{\out{</div>}}}
|
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
|
Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
|
||||||
Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
|
such as \link{xgb.train} or \link{predict.xgb.Booster}.
|
||||||
\code{\link{xgb.DMatrix.save}}).
|
|
||||||
}
|
}
|
||||||
\details{
|
\details{
|
||||||
|
Function 'xgb.QuantileDMatrix' will construct a DMatrix with quantization for the histogram
|
||||||
|
method already applied to it, which can be used to reduce memory usage (compared to using a
|
||||||
|
a regular DMatrix first and then creating a quantization out of it) when using the histogram
|
||||||
|
method (\code{tree_method = "hist"}, which is the default algorithm), but is not usable for the
|
||||||
|
sorted-indices method (\code{tree_method = "exact"}), nor for the approximate method
|
||||||
|
(\code{tree_method = "approx"}).
|
||||||
|
|
||||||
Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
Note that DMatrix objects are not serializable through R functions such as \code{saveRDS} or \code{save}.
|
||||||
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
|
If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
|
||||||
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
|
chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
|
||||||
|
|||||||
112
R-package/man/xgb.DataBatch.Rd
Normal file
112
R-package/man/xgb.DataBatch.Rd
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.DataBatch}
|
||||||
|
\alias{xgb.DataBatch}
|
||||||
|
\title{Structure for Data Batches}
|
||||||
|
\usage{
|
||||||
|
xgb.DataBatch(
|
||||||
|
data,
|
||||||
|
label = NULL,
|
||||||
|
weight = NULL,
|
||||||
|
base_margin = NULL,
|
||||||
|
feature_names = colnames(data),
|
||||||
|
feature_types = NULL,
|
||||||
|
group = NULL,
|
||||||
|
qid = NULL,
|
||||||
|
label_lower_bound = NULL,
|
||||||
|
label_upper_bound = NULL,
|
||||||
|
feature_weights = NULL
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{Batch of data belonging to this batch.
|
||||||
|
|
||||||
|
Note that not all of the input types supported by \link{xgb.DMatrix} are possible
|
||||||
|
to pass here. Supported types are:\itemize{
|
||||||
|
\item \code{matrix}, with types \code{numeric}, \code{integer}, and \code{logical}. Note that for types
|
||||||
|
\code{integer} and \code{logical}, missing values might not be automatically recognized as
|
||||||
|
as such - see the documentation for parameter \code{missing} in \link{xgb.ExternalDMatrix}
|
||||||
|
for details on this.
|
||||||
|
\item \code{data.frame}, with the same types as supported by 'xgb.DMatrix' and same
|
||||||
|
conversions applied to it. See the documentation for parameter \code{data} in
|
||||||
|
\link{xgb.DMatrix} for details on it.
|
||||||
|
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
|
||||||
|
}}
|
||||||
|
|
||||||
|
\item{label}{Label of the training data. For classification problems, should be passed encoded as
|
||||||
|
integers with numeration starting at zero.}
|
||||||
|
|
||||||
|
\item{weight}{Weight for each instance.
|
||||||
|
|
||||||
|
Note that, for ranking task, weights are per-group. In ranking task, one weight
|
||||||
|
is assigned to each group (not each data point). This is because we
|
||||||
|
only care about the relative ordering of data points within each group,
|
||||||
|
so it doesn't make sense to assign weights to individual data points.}
|
||||||
|
|
||||||
|
\item{base_margin}{Base margin used for boosting from existing model.
|
||||||
|
|
||||||
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ In the case of multi-output models, one can also pass multi-dimensional base_margin.
|
||||||
|
}\if{html}{\out{</div>}}}
|
||||||
|
|
||||||
|
\item{feature_names}{Set names for features. Overrides column names in data
|
||||||
|
frame and matrix.
|
||||||
|
|
||||||
|
\if{html}{\out{<div class="sourceCode">}}\preformatted{ Note: columns are not referenced by name when calling `predict`, so the column order there
|
||||||
|
must be the same as in the DMatrix construction, regardless of the column names.
|
||||||
|
}\if{html}{\out{</div>}}}
|
||||||
|
|
||||||
|
\item{feature_types}{Set types for features.
|
||||||
|
|
||||||
|
If \code{data} is a \code{data.frame} and passing \code{feature_types} is not supplied, feature types will be deduced
|
||||||
|
automatically from the column types.
|
||||||
|
|
||||||
|
Otherwise, one can pass a character vector with the same length as number of columns in \code{data},
|
||||||
|
with the following possible values:\itemize{
|
||||||
|
\item "c", which represents categorical columns.
|
||||||
|
\item "q", which represents numeric columns.
|
||||||
|
\item "int", which represents integer columns.
|
||||||
|
\item "i", which represents logical (boolean) columns.
|
||||||
|
}
|
||||||
|
|
||||||
|
Note that, while categorical types are treated differently from the rest for model fitting
|
||||||
|
purposes, the other types do not influence the generated model, but have effects in other
|
||||||
|
functionalities such as feature importances.
|
||||||
|
|
||||||
|
\bold{Important}: categorical features, if specified manually through \code{feature_types}, must
|
||||||
|
be encoded as integers with numeration starting at zero, and the same encoding needs to be
|
||||||
|
applied when passing data to \code{predict}. Even if passing \code{factor} types, the encoding will
|
||||||
|
not be saved, so make sure that \code{factor} columns passed to \code{predict} have the same \code{levels}.}
|
||||||
|
|
||||||
|
\item{group}{Group size for all ranking group.}
|
||||||
|
|
||||||
|
\item{qid}{Query ID for data samples, used for ranking.}
|
||||||
|
|
||||||
|
\item{label_lower_bound}{Lower bound for survival training.}
|
||||||
|
|
||||||
|
\item{label_upper_bound}{Upper bound for survival training.}
|
||||||
|
|
||||||
|
\item{feature_weights}{Set feature weights for column sampling.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An object of class \code{xgb.DataBatch}, which is just a list containing the
|
||||||
|
data and parameters passed here. It does \bold{not} inherit from \code{xgb.DMatrix}.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Helper function to supply data in batches of a data iterator when
|
||||||
|
constructing a DMatrix from external memory through \link{xgb.ExternalDMatrix}
|
||||||
|
or through \link{xgb.QuantileDMatrix.from_iterator}.
|
||||||
|
|
||||||
|
This function is \bold{only} meant to be called inside of a callback function (which
|
||||||
|
is passed as argument to function \link{xgb.DataIter} to construct a data iterator)
|
||||||
|
when constructing a DMatrix through external memory - otherwise, one should call
|
||||||
|
\link{xgb.DMatrix} or \link{xgb.QuantileDMatrix}.
|
||||||
|
|
||||||
|
The object that results from calling this function directly is \bold{not} like
|
||||||
|
an \code{xgb.DMatrix} - i.e. cannot be used to train a model, nor to get predictions - only
|
||||||
|
possible usage is to supply data to an iterator, from which a DMatrix is then constructed.
|
||||||
|
|
||||||
|
For more information and for example usage, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\link{xgb.DataIter}, \link{xgb.ExternalDMatrix}.
|
||||||
|
}
|
||||||
51
R-package/man/xgb.DataIter.Rd
Normal file
51
R-package/man/xgb.DataIter.Rd
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.DataIter}
|
||||||
|
\alias{xgb.DataIter}
|
||||||
|
\title{XGBoost Data Iterator}
|
||||||
|
\usage{
|
||||||
|
xgb.DataIter(env = new.env(), f_next, f_reset)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{env}{An R environment to pass to the callback functions supplied here, which can be
|
||||||
|
used to keep track of variables to determine how to handle the batches.
|
||||||
|
|
||||||
|
For example, one might want to keep track of an iteration number in this environment in order
|
||||||
|
to know which part of the data to pass next.}
|
||||||
|
|
||||||
|
\item{f_next}{\verb{function(env)} which is responsible for:\itemize{
|
||||||
|
\item Accessing or retrieving the next batch of data in the iterator.
|
||||||
|
\item Supplying this data by calling function \link{xgb.DataBatch} on it and returning the result.
|
||||||
|
\item Keeping track of where in the iterator batch it is or will go next, which can for example
|
||||||
|
be done by modifiying variables in the \code{env} variable that is passed here.
|
||||||
|
\item Signaling whether there are more batches to be consumed or not, by returning \code{NULL}
|
||||||
|
when the stream of data ends (all batches in the iterator have been consumed), or the result from
|
||||||
|
calling \link{xgb.DataBatch} when there are more batches in the line to be consumed.
|
||||||
|
}}
|
||||||
|
|
||||||
|
\item{f_reset}{\verb{function(env)} which is responsible for reseting the data iterator
|
||||||
|
(i.e. taking it back to the first batch, called before and after the sequence of batches
|
||||||
|
has been consumed).
|
||||||
|
|
||||||
|
Note that, after resetting the iterator, the batches will be accessed again, so the same data
|
||||||
|
(and in the same order) must be passed in subsequent iterations.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An \code{xgb.DataIter} object, containing the same inputs supplied here, which can then
|
||||||
|
be passed to \link{xgb.ExternalDMatrix}.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Interface to create a custom data iterator in order to construct a DMatrix
|
||||||
|
from external memory.
|
||||||
|
|
||||||
|
This function is responsible for generating an R object structure containing callback
|
||||||
|
functions and an environment shared with them.
|
||||||
|
|
||||||
|
The output structure from this function is then meant to be passed to \link{xgb.ExternalDMatrix},
|
||||||
|
which will consume the data and create a DMatrix from it by executing the callback functions.
|
||||||
|
|
||||||
|
For more information, and for a usage example, see the documentation for \link{xgb.ExternalDMatrix}.
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\link{xgb.ExternalDMatrix}, \link{xgb.DataBatch}.
|
||||||
|
}
|
||||||
122
R-package/man/xgb.ExternalDMatrix.Rd
Normal file
122
R-package/man/xgb.ExternalDMatrix.Rd
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.ExternalDMatrix}
|
||||||
|
\alias{xgb.ExternalDMatrix}
|
||||||
|
\title{DMatrix from External Data}
|
||||||
|
\usage{
|
||||||
|
xgb.ExternalDMatrix(
|
||||||
|
data_iterator,
|
||||||
|
cache_prefix = tempdir(),
|
||||||
|
missing = NA,
|
||||||
|
nthread = NULL
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
|
||||||
|
which includes an environment shared between function calls, and functions to access
|
||||||
|
the data in batches on-demand.}
|
||||||
|
|
||||||
|
\item{cache_prefix}{The path of cache file, caller must initialize all the directories in this path.}
|
||||||
|
|
||||||
|
\item{missing}{A float value to represents missing values in data.
|
||||||
|
|
||||||
|
Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
|
||||||
|
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
|
||||||
|
it will not be adapted for different input types.
|
||||||
|
|
||||||
|
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
|
||||||
|
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
|
||||||
|
which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||||
|
'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||||
|
This should not pose any problem for \code{numeric} types, since they do have an inheret NaN value.}
|
||||||
|
|
||||||
|
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An 'xgb.DMatrix' object, with subclass 'xgb.ExternalDMatrix', in which the data is not
|
||||||
|
held internally but accessed through the iterator when needed.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Create a special type of xgboost 'DMatrix' object from external data
|
||||||
|
supplied by an \link{xgb.DataIter} object, potentially passed in batches from a
|
||||||
|
bigger set that might not fit entirely in memory.
|
||||||
|
|
||||||
|
The data supplied by the iterator is accessed on-demand as needed, multiple times,
|
||||||
|
without being concatenated, but note that fields like 'label' \bold{will} be
|
||||||
|
concatenated from multiple calls to the data iterator.
|
||||||
|
|
||||||
|
For more information, see the guide 'Using XGBoost External Memory Version':
|
||||||
|
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||||
|
}
|
||||||
|
\examples{
|
||||||
|
library(xgboost)
|
||||||
|
data(mtcars)
|
||||||
|
|
||||||
|
# this custom environment will be passed to the iterator
|
||||||
|
# functions at each call. It's up to the user to keep
|
||||||
|
# track of the iteration number in this environment.
|
||||||
|
iterator_env <- as.environment(
|
||||||
|
list(
|
||||||
|
iter = 0,
|
||||||
|
x = mtcars[, -1],
|
||||||
|
y = mtcars[, 1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Data is passed in two batches.
|
||||||
|
# In this example, batches are obtained by subsetting the 'x' variable.
|
||||||
|
# This is not advantageous to do, since the data is already loaded in memory
|
||||||
|
# and can be passed in full in one go, but there can be situations in which
|
||||||
|
# only a subset of the data will fit in the computer's memory, and it can
|
||||||
|
# be loaded in batches that are accessed one-at-a-time only.
|
||||||
|
iterator_next <- function(iterator_env) {
|
||||||
|
curr_iter <- iterator_env[["iter"]]
|
||||||
|
if (curr_iter >= 2) {
|
||||||
|
# there are only two batches, so this signals end of the stream
|
||||||
|
return(NULL)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curr_iter == 0) {
|
||||||
|
x_batch <- iterator_env[["x"]][1:16, ]
|
||||||
|
y_batch <- iterator_env[["y"]][1:16]
|
||||||
|
} else {
|
||||||
|
x_batch <- iterator_env[["x"]][17:32, ]
|
||||||
|
y_batch <- iterator_env[["y"]][17:32]
|
||||||
|
}
|
||||||
|
on.exit({
|
||||||
|
iterator_env[["iter"]] <- curr_iter + 1
|
||||||
|
})
|
||||||
|
|
||||||
|
# Function 'xgb.DataBatch' must be called manually
|
||||||
|
# at each batch with all the appropriate attributes,
|
||||||
|
# such as feature names and feature types.
|
||||||
|
return(xgb.DataBatch(data = x_batch, label = y_batch))
|
||||||
|
}
|
||||||
|
|
||||||
|
# This moves the iterator back to its beginning
|
||||||
|
iterator_reset <- function(iterator_env) {
|
||||||
|
iterator_env[["iter"]] <- 0
|
||||||
|
}
|
||||||
|
|
||||||
|
data_iterator <- xgb.DataIter(
|
||||||
|
env = iterator_env,
|
||||||
|
f_next = iterator_next,
|
||||||
|
f_reset = iterator_reset
|
||||||
|
)
|
||||||
|
cache_prefix <- tempdir()
|
||||||
|
|
||||||
|
# DMatrix will be constructed from the iterator's batches
|
||||||
|
dm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
|
||||||
|
|
||||||
|
# After construction, can be used as a regular DMatrix
|
||||||
|
params <- list(nthread = 1, objective = "reg:squarederror")
|
||||||
|
model <- xgb.train(data = dm, nrounds = 2, params = params)
|
||||||
|
|
||||||
|
# Predictions can also be called on it, and should be the same
|
||||||
|
# as if the data were passed differently.
|
||||||
|
pred_dm <- predict(model, dm)
|
||||||
|
pred_mat <- predict(model, as.matrix(mtcars[, -1]))
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.QuantileDMatrix.from_iterator}
|
||||||
|
}
|
||||||
65
R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
Normal file
65
R-package/man/xgb.QuantileDMatrix.from_iterator.Rd
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
|
\name{xgb.QuantileDMatrix.from_iterator}
|
||||||
|
\alias{xgb.QuantileDMatrix.from_iterator}
|
||||||
|
\title{QuantileDMatrix from External Data}
|
||||||
|
\usage{
|
||||||
|
xgb.QuantileDMatrix.from_iterator(
|
||||||
|
data_iterator,
|
||||||
|
missing = NA,
|
||||||
|
nthread = NULL,
|
||||||
|
ref = NULL,
|
||||||
|
max_bin = NULL
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data_iterator}{A data iterator structure as returned by \link{xgb.DataIter},
|
||||||
|
which includes an environment shared between function calls, and functions to access
|
||||||
|
the data in batches on-demand.}
|
||||||
|
|
||||||
|
\item{missing}{A float value to represents missing values in data.
|
||||||
|
|
||||||
|
Note that, while functions like \link{xgb.DMatrix} can take a generic \code{NA} and interpret it
|
||||||
|
correctly for different types like \code{numeric} and \code{integer}, if an \code{NA} value is passed here,
|
||||||
|
it will not be adapted for different input types.
|
||||||
|
|
||||||
|
For example, in R \code{integer} types, missing values are represented by integer number \code{-2147483648}
|
||||||
|
(since machine 'integer' types do not have an inherent 'NA' value) - hence, if one passes \code{NA},
|
||||||
|
which is interpreted as a floating-point NaN by 'xgb.ExternalDMatrix' and by
|
||||||
|
'xgb.QuantileDMatrix.from_iterator', these integer missing values will not be treated as missing.
|
||||||
|
This should not pose any problem for \code{numeric} types, since they do have an inheret NaN value.}
|
||||||
|
|
||||||
|
\item{nthread}{Number of threads used for creating DMatrix.}
|
||||||
|
|
||||||
|
\item{ref}{The training dataset that provides quantile information, needed when creating
|
||||||
|
validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix
|
||||||
|
as a reference means that the same quantisation applied to the training data is
|
||||||
|
applied to the validation/test data}
|
||||||
|
|
||||||
|
\item{max_bin}{The number of histogram bin, should be consistent with the training parameter
|
||||||
|
\code{max_bin}.
|
||||||
|
|
||||||
|
This is only supported when constructing a QuantileDMatrix.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An 'xgb.DMatrix' object, with subclass 'xgb.QuantileDMatrix'.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Create an \code{xgb.QuantileDMatrix} object (exact same class as would be returned by
|
||||||
|
calling function \link{xgb.QuantileDMatrix}, with the same advantages and limitations) from
|
||||||
|
external data supplied by an \link{xgb.DataIter} object, potentially passed in batches from
|
||||||
|
a bigger set that might not fit entirely in memory, same way as \link{xgb.ExternalDMatrix}.
|
||||||
|
|
||||||
|
Note that, while external data will only be loaded through the iterator (thus the full data
|
||||||
|
might not be held entirely in-memory), the quantized representation of the data will get
|
||||||
|
created in-memory, being concatenated from multiple calls to the data iterator. The quantized
|
||||||
|
version is typically lighter than the original data, so there might be cases in which this
|
||||||
|
representation could potentially fit in memory even if the full data doesn't.
|
||||||
|
|
||||||
|
For more information, see the guide 'Using XGBoost External Memory Version':
|
||||||
|
\url{https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html}
|
||||||
|
}
|
||||||
|
\seealso{
|
||||||
|
\link{xgb.DataIter}, \link{xgb.DataBatch}, \link{xgb.ExternalDMatrix},
|
||||||
|
\link{xgb.QuantileDMatrix}
|
||||||
|
}
|
||||||
@ -1,15 +1,12 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
% Generated by roxygen2: do not edit by hand
|
||||||
% Please edit documentation in R/xgb.DMatrix.R
|
% Please edit documentation in R/xgb.DMatrix.R
|
||||||
\name{slice}
|
\name{xgb.slice.DMatrix}
|
||||||
\alias{slice}
|
\alias{xgb.slice.DMatrix}
|
||||||
\alias{slice.xgb.DMatrix}
|
|
||||||
\alias{[.xgb.DMatrix}
|
\alias{[.xgb.DMatrix}
|
||||||
\title{Get a new DMatrix containing the specified rows of
|
\title{Get a new DMatrix containing the specified rows of
|
||||||
original xgb.DMatrix object}
|
original xgb.DMatrix object}
|
||||||
\usage{
|
\usage{
|
||||||
slice(object, idxset)
|
xgb.slice.DMatrix(object, idxset)
|
||||||
|
|
||||||
\method{slice}{xgb.DMatrix}(object, idxset)
|
|
||||||
|
|
||||||
\method{[}{xgb.DMatrix}(object, idxset, colset = NULL)
|
\method{[}{xgb.DMatrix}(object, idxset, colset = NULL)
|
||||||
}
|
}
|
||||||
@ -28,7 +25,7 @@ original xgb.DMatrix object
|
|||||||
data(agaricus.train, package='xgboost')
|
data(agaricus.train, package='xgboost')
|
||||||
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
|
||||||
|
|
||||||
dsub <- slice(dtrain, 1:42)
|
dsub <- xgb.slice.DMatrix(dtrain, 1:42)
|
||||||
labels1 <- getinfo(dsub, 'label')
|
labels1 <- getinfo(dsub, 'label')
|
||||||
dsub <- dtrain[1:42, ]
|
dsub <- dtrain[1:42, ]
|
||||||
labels2 <- getinfo(dsub, 'label')
|
labels2 <- getinfo(dsub, 'label')
|
||||||
@ -54,6 +54,14 @@ extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
|
|||||||
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
|
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
|
||||||
extern SEXP XGDMatrixNumCol_R(SEXP);
|
extern SEXP XGDMatrixNumCol_R(SEXP);
|
||||||
extern SEXP XGDMatrixNumRow_R(SEXP);
|
extern SEXP XGDMatrixNumRow_R(SEXP);
|
||||||
|
extern SEXP XGProxyDMatrixCreate_R();
|
||||||
|
extern SEXP XGProxyDMatrixSetDataDense_R(SEXP, SEXP);
|
||||||
|
extern SEXP XGProxyDMatrixSetDataCSR_R(SEXP, SEXP);
|
||||||
|
extern SEXP XGProxyDMatrixSetDataColumnar_R(SEXP, SEXP);
|
||||||
|
extern SEXP XGDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
|
||||||
|
extern SEXP XGQuantileDMatrixCreateFromCallback_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
|
||||||
|
extern SEXP XGDMatrixFree_R(SEXP);
|
||||||
|
extern SEXP XGGetRNAIntAsDouble();
|
||||||
extern SEXP XGDMatrixGetQuantileCut_R(SEXP);
|
extern SEXP XGDMatrixGetQuantileCut_R(SEXP);
|
||||||
extern SEXP XGDMatrixNumNonMissing_R(SEXP);
|
extern SEXP XGDMatrixNumNonMissing_R(SEXP);
|
||||||
extern SEXP XGDMatrixGetDataAsCSR_R(SEXP);
|
extern SEXP XGDMatrixGetDataAsCSR_R(SEXP);
|
||||||
@ -105,6 +113,14 @@ static const R_CallMethodDef CallEntries[] = {
|
|||||||
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
|
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
|
||||||
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
|
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
|
||||||
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
|
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
|
||||||
|
{"XGProxyDMatrixCreate_R", (DL_FUNC) &XGProxyDMatrixCreate_R, 0},
|
||||||
|
{"XGProxyDMatrixSetDataDense_R", (DL_FUNC) &XGProxyDMatrixSetDataDense_R, 2},
|
||||||
|
{"XGProxyDMatrixSetDataCSR_R", (DL_FUNC) &XGProxyDMatrixSetDataCSR_R, 2},
|
||||||
|
{"XGProxyDMatrixSetDataColumnar_R", (DL_FUNC) &XGProxyDMatrixSetDataColumnar_R, 2},
|
||||||
|
{"XGDMatrixCreateFromCallback_R", (DL_FUNC) &XGDMatrixCreateFromCallback_R, 7},
|
||||||
|
{"XGQuantileDMatrixCreateFromCallback_R", (DL_FUNC) &XGQuantileDMatrixCreateFromCallback_R, 8},
|
||||||
|
{"XGDMatrixFree_R", (DL_FUNC) &XGDMatrixFree_R, 1},
|
||||||
|
{"XGGetRNAIntAsDouble", (DL_FUNC) &XGGetRNAIntAsDouble, 0},
|
||||||
{"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1},
|
{"XGDMatrixGetQuantileCut_R", (DL_FUNC) &XGDMatrixGetQuantileCut_R, 1},
|
||||||
{"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1},
|
{"XGDMatrixNumNonMissing_R", (DL_FUNC) &XGDMatrixNumNonMissing_R, 1},
|
||||||
{"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1},
|
{"XGDMatrixGetDataAsCSR_R", (DL_FUNC) &XGDMatrixGetDataAsCSR_R, 1},
|
||||||
|
|||||||
@ -27,7 +27,12 @@
|
|||||||
#include "./xgboost_R.h" // Must follow other includes.
|
#include "./xgboost_R.h" // Must follow other includes.
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
struct ErrorWithUnwind : public std::exception {};
|
|
||||||
|
/* Note: this class is used as a throwable exception.
|
||||||
|
Some xgboost C functions that use callbacks will catch exceptions
|
||||||
|
that happen inside of the callback execution, hence it purposefully
|
||||||
|
doesn't inherit from 'std::exception' even if used as such. */
|
||||||
|
struct ErrorWithUnwind {};
|
||||||
|
|
||||||
void ThrowExceptionFromRError(void *, Rboolean jump) {
|
void ThrowExceptionFromRError(void *, Rboolean jump) {
|
||||||
if (jump) {
|
if (jump) {
|
||||||
@ -51,6 +56,27 @@ SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
|
|||||||
continuation_token);
|
continuation_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct RFunAndEnv {
|
||||||
|
SEXP R_fun;
|
||||||
|
SEXP R_calling_env;
|
||||||
|
};
|
||||||
|
|
||||||
|
SEXP WrappedExecFun(void *void_ptr) {
|
||||||
|
RFunAndEnv *r_fun_and_env = static_cast<RFunAndEnv*>(void_ptr);
|
||||||
|
SEXP f_expr = Rf_protect(Rf_lang1(r_fun_and_env->R_fun));
|
||||||
|
SEXP out = Rf_protect(Rf_eval(f_expr, r_fun_and_env->R_calling_env));
|
||||||
|
Rf_unprotect(2);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP SafeExecFun(SEXP R_fun, SEXP R_calling_env, SEXP continuation_token) {
|
||||||
|
RFunAndEnv r_fun_and_env{R_fun, R_calling_env};
|
||||||
|
return R_UnwindProtect(
|
||||||
|
WrappedExecFun, static_cast<void*>(&r_fun_and_env),
|
||||||
|
ThrowExceptionFromRError, nullptr,
|
||||||
|
continuation_token);
|
||||||
|
}
|
||||||
|
|
||||||
SEXP WrappedAllocReal(void *void_ptr) {
|
SEXP WrappedAllocReal(void *void_ptr) {
|
||||||
size_t *size = static_cast<size_t*>(void_ptr);
|
size_t *size = static_cast<size_t*>(void_ptr);
|
||||||
return Rf_allocVector(REALSXP, *size);
|
return Rf_allocVector(REALSXP, *size);
|
||||||
@ -140,6 +166,47 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] std::string MakeArrayInterfaceFromRDataFrame(SEXP R_df) {
|
||||||
|
auto make_vec = [&](auto const *ptr, std::size_t len) {
|
||||||
|
auto v = xgboost::linalg::MakeVec(ptr, len);
|
||||||
|
return xgboost::linalg::ArrayInterface(v);
|
||||||
|
};
|
||||||
|
|
||||||
|
R_xlen_t n_features = Rf_xlength(R_df);
|
||||||
|
std::vector<xgboost::Json> array(n_features);
|
||||||
|
CHECK_GT(n_features, 0);
|
||||||
|
std::size_t len = Rf_xlength(VECTOR_ELT(R_df, 0));
|
||||||
|
|
||||||
|
// The `data.frame` in R actually converts all data into numeric. The other type
|
||||||
|
// handlers here are not used. At the moment they are kept as a reference for when we
|
||||||
|
// can avoid making data copies during transformation.
|
||||||
|
for (R_xlen_t i = 0; i < n_features; ++i) {
|
||||||
|
switch (TYPEOF(VECTOR_ELT(R_df, i))) {
|
||||||
|
case INTSXP: {
|
||||||
|
auto const *ptr = INTEGER(VECTOR_ELT(R_df, i));
|
||||||
|
array[i] = make_vec(ptr, len);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case REALSXP: {
|
||||||
|
auto const *ptr = REAL(VECTOR_ELT(R_df, i));
|
||||||
|
array[i] = make_vec(ptr, len);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case LGLSXP: {
|
||||||
|
auto const *ptr = LOGICAL(VECTOR_ELT(R_df, i));
|
||||||
|
array[i] = make_vec(ptr, len);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: {
|
||||||
|
LOG(FATAL) << "data.frame has unsupported type.";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
xgboost::Json jinterface{std::move(array)};
|
||||||
|
return xgboost::Json::Dump(jinterface);
|
||||||
|
}
|
||||||
|
|
||||||
[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
|
[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
|
||||||
using namespace ::xgboost; // NOLINT
|
using namespace ::xgboost; // NOLINT
|
||||||
Json jconfig{Object{}};
|
Json jconfig{Object{}};
|
||||||
@ -335,51 +402,13 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
|
|||||||
R_API_BEGIN();
|
R_API_BEGIN();
|
||||||
|
|
||||||
DMatrixHandle handle;
|
DMatrixHandle handle;
|
||||||
|
|
||||||
auto make_vec = [&](auto const *ptr, std::int32_t len) {
|
|
||||||
auto v = xgboost::linalg::MakeVec(ptr, len);
|
|
||||||
return xgboost::linalg::ArrayInterface(v);
|
|
||||||
};
|
|
||||||
|
|
||||||
std::int32_t rc{0};
|
std::int32_t rc{0};
|
||||||
{
|
{
|
||||||
using xgboost::Json;
|
std::string sinterface = MakeArrayInterfaceFromRDataFrame(df);
|
||||||
auto n_features = Rf_xlength(df);
|
xgboost::Json jconfig{xgboost::Object{}};
|
||||||
std::vector<Json> array(n_features);
|
|
||||||
CHECK_GT(n_features, 0);
|
|
||||||
auto len = Rf_xlength(VECTOR_ELT(df, 0));
|
|
||||||
// The `data.frame` in R actually converts all data into numeric. The other type
|
|
||||||
// handlers here are not used. At the moment they are kept as a reference for when we
|
|
||||||
// can avoid making data copies during transformation.
|
|
||||||
for (decltype(n_features) i = 0; i < n_features; ++i) {
|
|
||||||
switch (TYPEOF(VECTOR_ELT(df, i))) {
|
|
||||||
case INTSXP: {
|
|
||||||
auto const *ptr = INTEGER(VECTOR_ELT(df, i));
|
|
||||||
array[i] = make_vec(ptr, len);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case REALSXP: {
|
|
||||||
auto const *ptr = REAL(VECTOR_ELT(df, i));
|
|
||||||
array[i] = make_vec(ptr, len);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case LGLSXP: {
|
|
||||||
auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
|
|
||||||
array[i] = make_vec(ptr, len);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default: {
|
|
||||||
LOG(FATAL) << "data.frame has unsupported type.";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Json jinterface{std::move(array)};
|
|
||||||
auto sinterface = Json::Dump(jinterface);
|
|
||||||
Json jconfig{xgboost::Object{}};
|
|
||||||
jconfig["missing"] = asReal(missing);
|
jconfig["missing"] = asReal(missing);
|
||||||
jconfig["nthread"] = asInteger(n_threads);
|
jconfig["nthread"] = asInteger(n_threads);
|
||||||
auto sconfig = Json::Dump(jconfig);
|
std::string sconfig = xgboost::Json::Dump(jconfig);
|
||||||
|
|
||||||
rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
|
rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
|
||||||
}
|
}
|
||||||
@ -632,6 +661,192 @@ XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
|
|||||||
return ScalarInteger(static_cast<int>(ncol));
|
return ScalarInteger(static_cast<int>(ncol));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixCreate_R() {
|
||||||
|
SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
||||||
|
R_API_BEGIN();
|
||||||
|
DMatrixHandle proxy_dmat_handle;
|
||||||
|
CHECK_CALL(XGProxyDMatrixCreate(&proxy_dmat_handle));
|
||||||
|
R_SetExternalPtrAddr(out, proxy_dmat_handle);
|
||||||
|
R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE);
|
||||||
|
Rf_unprotect(1);
|
||||||
|
R_API_END();
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat) {
|
||||||
|
R_API_BEGIN();
|
||||||
|
DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
|
||||||
|
int res_code;
|
||||||
|
{
|
||||||
|
std::string array_str = MakeArrayInterfaceFromRMat(R_mat);
|
||||||
|
res_code = XGProxyDMatrixSetDataDense(proxy_dmat, array_str.c_str());
|
||||||
|
}
|
||||||
|
CHECK_CALL(res_code);
|
||||||
|
R_API_END();
|
||||||
|
return R_NilValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst) {
|
||||||
|
R_API_BEGIN();
|
||||||
|
DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
|
||||||
|
int res_code;
|
||||||
|
{
|
||||||
|
std::string array_str_indptr = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 0));
|
||||||
|
std::string array_str_indices = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 1));
|
||||||
|
std::string array_str_data = MakeArrayInterfaceFromRVector(VECTOR_ELT(lst, 2));
|
||||||
|
const int ncol = Rf_asInteger(VECTOR_ELT(lst, 3));
|
||||||
|
res_code = XGProxyDMatrixSetDataCSR(proxy_dmat,
|
||||||
|
array_str_indptr.c_str(),
|
||||||
|
array_str_indices.c_str(),
|
||||||
|
array_str_data.c_str(),
|
||||||
|
ncol);
|
||||||
|
}
|
||||||
|
CHECK_CALL(res_code);
|
||||||
|
R_API_END();
|
||||||
|
return R_NilValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst) {
|
||||||
|
R_API_BEGIN();
|
||||||
|
DMatrixHandle proxy_dmat = R_ExternalPtrAddr(handle);
|
||||||
|
int res_code;
|
||||||
|
{
|
||||||
|
std::string sinterface = MakeArrayInterfaceFromRDataFrame(lst);
|
||||||
|
res_code = XGProxyDMatrixSetDataColumnar(proxy_dmat, sinterface.c_str());
|
||||||
|
}
|
||||||
|
CHECK_CALL(res_code);
|
||||||
|
R_API_END();
|
||||||
|
return R_NilValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
struct _RDataIterator {
|
||||||
|
SEXP f_next;
|
||||||
|
SEXP f_reset;
|
||||||
|
SEXP calling_env;
|
||||||
|
SEXP continuation_token;
|
||||||
|
|
||||||
|
_RDataIterator(
|
||||||
|
SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP continuation_token) :
|
||||||
|
f_next(f_next), f_reset(f_reset), calling_env(calling_env),
|
||||||
|
continuation_token(continuation_token) {}
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
SafeExecFun(this->f_reset, this->calling_env, this->continuation_token);
|
||||||
|
}
|
||||||
|
|
||||||
|
int next() {
|
||||||
|
SEXP R_res = Rf_protect(
|
||||||
|
SafeExecFun(this->f_next, this->calling_env, this->continuation_token));
|
||||||
|
int res = Rf_asInteger(R_res);
|
||||||
|
Rf_unprotect(1);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void _reset_RDataIterator(DataIterHandle iter) {
|
||||||
|
static_cast<_RDataIterator*>(iter)->reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
int _next_RDataIterator(DataIterHandle iter) {
|
||||||
|
return static_cast<_RDataIterator*>(iter)->next();
|
||||||
|
}
|
||||||
|
|
||||||
|
SEXP XGDMatrixCreateFromCallbackGeneric_R(
|
||||||
|
SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
|
||||||
|
SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat,
|
||||||
|
SEXP cache_prefix, bool as_quantile_dmatrix) {
|
||||||
|
SEXP continuation_token = Rf_protect(R_MakeUnwindCont());
|
||||||
|
SEXP out = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
||||||
|
R_API_BEGIN();
|
||||||
|
DMatrixHandle out_dmat;
|
||||||
|
|
||||||
|
int res_code;
|
||||||
|
try {
|
||||||
|
_RDataIterator data_iterator(f_next, f_reset, calling_env, continuation_token);
|
||||||
|
|
||||||
|
std::string str_cache_prefix;
|
||||||
|
xgboost::Json jconfig{xgboost::Object{}};
|
||||||
|
jconfig["missing"] = Rf_asReal(missing);
|
||||||
|
if (!Rf_isNull(n_threads)) {
|
||||||
|
jconfig["nthread"] = Rf_asInteger(n_threads);
|
||||||
|
}
|
||||||
|
if (as_quantile_dmatrix) {
|
||||||
|
if (!Rf_isNull(max_bin)) {
|
||||||
|
jconfig["max_bin"] = Rf_asInteger(max_bin);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
str_cache_prefix = std::string(CHAR(Rf_asChar(cache_prefix)));
|
||||||
|
jconfig["cache_prefix"] = str_cache_prefix;
|
||||||
|
}
|
||||||
|
std::string json_str = xgboost::Json::Dump(jconfig);
|
||||||
|
|
||||||
|
DMatrixHandle ref_dmat_handle = nullptr;
|
||||||
|
if (as_quantile_dmatrix && !Rf_isNull(ref_dmat)) {
|
||||||
|
ref_dmat_handle = R_ExternalPtrAddr(ref_dmat);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (as_quantile_dmatrix) {
|
||||||
|
res_code = XGQuantileDMatrixCreateFromCallback(
|
||||||
|
&data_iterator,
|
||||||
|
R_ExternalPtrAddr(proxy_dmat),
|
||||||
|
ref_dmat_handle,
|
||||||
|
_reset_RDataIterator,
|
||||||
|
_next_RDataIterator,
|
||||||
|
json_str.c_str(),
|
||||||
|
&out_dmat);
|
||||||
|
} else {
|
||||||
|
res_code = XGDMatrixCreateFromCallback(
|
||||||
|
&data_iterator,
|
||||||
|
R_ExternalPtrAddr(proxy_dmat),
|
||||||
|
_reset_RDataIterator,
|
||||||
|
_next_RDataIterator,
|
||||||
|
json_str.c_str(),
|
||||||
|
&out_dmat);
|
||||||
|
}
|
||||||
|
} catch (ErrorWithUnwind &e) {
|
||||||
|
R_ContinueUnwind(continuation_token);
|
||||||
|
}
|
||||||
|
CHECK_CALL(res_code);
|
||||||
|
|
||||||
|
R_SetExternalPtrAddr(out, out_dmat);
|
||||||
|
R_RegisterCFinalizerEx(out, _DMatrixFinalizer, TRUE);
|
||||||
|
Rf_unprotect(2);
|
||||||
|
R_API_END();
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
} /* namespace */
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R(
|
||||||
|
SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
|
||||||
|
SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat) {
|
||||||
|
return XGDMatrixCreateFromCallbackGeneric_R(
|
||||||
|
f_next, f_reset, calling_env, proxy_dmat,
|
||||||
|
n_threads, missing, max_bin, ref_dmat,
|
||||||
|
R_NilValue, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGDMatrixCreateFromCallback_R(
|
||||||
|
SEXP f_next, SEXP f_reset, SEXP calling_env, SEXP proxy_dmat,
|
||||||
|
SEXP n_threads, SEXP missing, SEXP cache_prefix) {
|
||||||
|
return XGDMatrixCreateFromCallbackGeneric_R(
|
||||||
|
f_next, f_reset, calling_env, proxy_dmat,
|
||||||
|
n_threads, missing, R_NilValue, R_NilValue,
|
||||||
|
cache_prefix, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat) {
|
||||||
|
_DMatrixFinalizer(proxy_dmat);
|
||||||
|
return R_NilValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
XGB_DLL SEXP XGGetRNAIntAsDouble() {
|
||||||
|
double sentinel_as_double = static_cast<double>(R_NaInt);
|
||||||
|
return Rf_ScalarReal(sentinel_as_double);
|
||||||
|
}
|
||||||
|
|
||||||
XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
|
XGB_DLL SEXP XGDuplicate_R(SEXP obj) {
|
||||||
return Rf_duplicate(obj);
|
return Rf_duplicate(obj);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -161,6 +161,84 @@ XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle);
|
|||||||
*/
|
*/
|
||||||
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
|
XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
<<<<<<< HEAD
|
||||||
|
* \brief create a ProxyDMatrix and get an R externalptr object for it
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixCreate_R();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Set dense matrix data on a proxy dmatrix
|
||||||
|
* \param handle R externalptr pointing to a ProxyDMatrix
|
||||||
|
* \param R_mat R matrix to set in the proxy dmatrix
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataDense_R(SEXP handle, SEXP R_mat);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Set dense matrix data on a proxy dmatrix
|
||||||
|
* \param handle R externalptr pointing to a ProxyDMatrix
|
||||||
|
* \param lst R list containing, in this order:
|
||||||
|
* 1. 'p' or 'indptr' vector of the CSR matrix.
|
||||||
|
* 2. 'j' or 'indices' vector of the CSR matrix.
|
||||||
|
* 3. 'x' or 'data' vector of the CSR matrix.
|
||||||
|
* 4. Number of columns in the CSR matrix.
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataCSR_R(SEXP handle, SEXP lst);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Set dense matrix data on a proxy dmatrix
|
||||||
|
* \param handle R externalptr pointing to a ProxyDMatrix
|
||||||
|
* \param lst R list or data.frame object containing its columns as numeric vectors
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGProxyDMatrixSetDataColumnar_R(SEXP handle, SEXP lst);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Create a DMatrix from a DataIter with callbacks
|
||||||
|
* \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy
|
||||||
|
* dmatrix and returns either zero (end of batch) or one (batch continues).
|
||||||
|
* \param expr_f_reset expression for function(env) that resets the data iterator to
|
||||||
|
* the beginning (first batch).
|
||||||
|
* \param calling_env R environment where to evaluate the expressions above
|
||||||
|
* \param proxy_dmat R externalptr holding a ProxyDMatrix.
|
||||||
|
* \param n_threads number of parallel threads to use for constructing the DMatrix.
|
||||||
|
* \param missing which value to represent missing value.
|
||||||
|
* \param cache_prefix path of cache file
|
||||||
|
* \return handle R externalptr holding the resulting DMatrix.
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGDMatrixCreateFromCallback_R(
|
||||||
|
SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat,
|
||||||
|
SEXP n_threads, SEXP missing, SEXP cache_prefix);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Create a QuantileDMatrix from a DataIter with callbacks
|
||||||
|
* \param expr_f_next expression for function(env, proxy_dmat) that sets the data on the proxy
|
||||||
|
* dmatrix and returns either zero (end of batch) or one (batch continues).
|
||||||
|
* \param expr_f_reset expression for function(env) that resets the data iterator to
|
||||||
|
* the beginning (first batch).
|
||||||
|
* \param calling_env R environment where to evaluate the expressions above
|
||||||
|
* \param proxy_dmat R externalptr holding a ProxyDMatrix.
|
||||||
|
* \param n_threads number of parallel threads to use for constructing the QuantileDMatrix.
|
||||||
|
* \param missing which value to represent missing value.
|
||||||
|
* \param max_bin maximum number of bins to have in the resulting QuantileDMatrix.
|
||||||
|
* \param ref_dmat an optional reference DMatrix from which to get the bin boundaries.
|
||||||
|
* \return handle R externalptr holding the resulting QuantileDMatrix.
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGQuantileDMatrixCreateFromCallback_R(
|
||||||
|
SEXP expr_f_next, SEXP expr_f_reset, SEXP calling_env, SEXP proxy_dmat,
|
||||||
|
SEXP n_threads, SEXP missing, SEXP max_bin, SEXP ref_dmat);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Frees a ProxyDMatrix and empties out the R externalptr object that holds it
|
||||||
|
* \param proxy_dmat R externalptr containing a ProxyDMatrix
|
||||||
|
* \return NULL
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGDMatrixFree_R(SEXP proxy_dmat);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Get the value that represents missingness in R integers as a numeric non-missing value.
|
||||||
|
*/
|
||||||
|
XGB_DLL SEXP XGGetRNAIntAsDouble();
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief Call R C-level function 'duplicate'
|
* \brief Call R C-level function 'duplicate'
|
||||||
* \param obj Object to duplicate
|
* \param obj Object to duplicate
|
||||||
|
|||||||
@ -166,7 +166,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
|
|||||||
test_that("xgb.DMatrix: slice, dim", {
|
test_that("xgb.DMatrix: slice, dim", {
|
||||||
dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
|
dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
|
||||||
expect_equal(dim(dtest), dim(test_data))
|
expect_equal(dim(dtest), dim(test_data))
|
||||||
dsub1 <- slice(dtest, 1:42)
|
dsub1 <- xgb.slice.DMatrix(dtest, 1:42)
|
||||||
expect_equal(nrow(dsub1), 42)
|
expect_equal(nrow(dsub1), 42)
|
||||||
expect_equal(ncol(dsub1), ncol(test_data))
|
expect_equal(ncol(dsub1), ncol(test_data))
|
||||||
|
|
||||||
@ -182,12 +182,12 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
|
|||||||
dtrain <- xgb.DMatrix(
|
dtrain <- xgb.DMatrix(
|
||||||
data = train_data, label = train_label, nthread = n_threads
|
data = train_data, label = train_label, nthread = n_threads
|
||||||
)
|
)
|
||||||
slice(dtrain, 6513L)
|
xgb.slice.DMatrix(dtrain, 6513L)
|
||||||
train_data[6513, ] <- 0
|
train_data[6513, ] <- 0
|
||||||
dtrain <- xgb.DMatrix(
|
dtrain <- xgb.DMatrix(
|
||||||
data = train_data, label = train_label, nthread = n_threads
|
data = train_data, label = train_label, nthread = n_threads
|
||||||
)
|
)
|
||||||
slice(dtrain, 6513L)
|
xgb.slice.DMatrix(dtrain, 6513L)
|
||||||
expect_equal(nrow(dtrain), 6513)
|
expect_equal(nrow(dtrain), 6513)
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -338,19 +338,18 @@ test_that("xgb.DMatrix: data.frame", {
|
|||||||
stringsAsFactors = TRUE
|
stringsAsFactors = TRUE
|
||||||
)
|
)
|
||||||
|
|
||||||
m <- xgb.DMatrix(df, enable_categorical = TRUE)
|
m <- xgb.DMatrix(df)
|
||||||
expect_equal(colnames(m), colnames(df))
|
expect_equal(colnames(m), colnames(df))
|
||||||
expect_equal(
|
expect_equal(
|
||||||
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
|
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
|
||||||
)
|
)
|
||||||
expect_error(xgb.DMatrix(df))
|
|
||||||
|
|
||||||
df <- data.frame(
|
df <- data.frame(
|
||||||
missing = c("a", "b", "d", NA),
|
missing = c("a", "b", "d", NA),
|
||||||
valid = c("a", "b", "d", "c"),
|
valid = c("a", "b", "d", "c"),
|
||||||
stringsAsFactors = TRUE
|
stringsAsFactors = TRUE
|
||||||
)
|
)
|
||||||
m <- xgb.DMatrix(df, enable_categorical = TRUE)
|
m <- xgb.DMatrix(df)
|
||||||
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
|
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -380,6 +379,261 @@ test_that("xgb.DMatrix: can take multi-dimensional 'base_margin'", {
|
|||||||
expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
|
expect_equal(pred_only_x, pred_w_base - b, tolerance = 1e-5)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: QuantileDMatrix produces same result as DMatrix", {
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars[, 1]
|
||||||
|
x <- mtcars[, -1]
|
||||||
|
|
||||||
|
cast_matrix <- function(x) as.matrix(x)
|
||||||
|
cast_df <- function(x) as.data.frame(x)
|
||||||
|
cast_csr <- function(x) as(as.matrix(x), "RsparseMatrix")
|
||||||
|
casting_funs <- list(cast_matrix, cast_df, cast_csr)
|
||||||
|
|
||||||
|
for (casting_fun in casting_funs) {
|
||||||
|
|
||||||
|
qdm <- xgb.QuantileDMatrix(
|
||||||
|
data = casting_fun(x),
|
||||||
|
label = y,
|
||||||
|
nthread = n_threads,
|
||||||
|
max_bin = 5
|
||||||
|
)
|
||||||
|
params <- list(
|
||||||
|
tree_method = "hist",
|
||||||
|
objective = "reg:squarederror",
|
||||||
|
nthread = n_threads,
|
||||||
|
max_bin = 5
|
||||||
|
)
|
||||||
|
model_qdm <- xgb.train(
|
||||||
|
params = params,
|
||||||
|
data = qdm,
|
||||||
|
nrounds = 2
|
||||||
|
)
|
||||||
|
pred_qdm <- predict(model_qdm, x)
|
||||||
|
|
||||||
|
dm <- xgb.DMatrix(
|
||||||
|
data = x,
|
||||||
|
label = y,
|
||||||
|
nthread = n_threads
|
||||||
|
)
|
||||||
|
model_dm <- xgb.train(
|
||||||
|
params = params,
|
||||||
|
data = dm,
|
||||||
|
nrounds = 2
|
||||||
|
)
|
||||||
|
pred_dm <- predict(model_dm, x)
|
||||||
|
|
||||||
|
expect_equal(pred_qdm, pred_dm)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: QuantileDMatrix is not accepted by exact method", {
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars[, 1]
|
||||||
|
x <- as.matrix(mtcars[, -1])
|
||||||
|
qdm <- xgb.QuantileDMatrix(
|
||||||
|
data = x,
|
||||||
|
label = y,
|
||||||
|
nthread = n_threads
|
||||||
|
)
|
||||||
|
params <- list(
|
||||||
|
tree_method = "exact",
|
||||||
|
objective = "reg:squarederror",
|
||||||
|
nthread = n_threads
|
||||||
|
)
|
||||||
|
expect_error({
|
||||||
|
xgb.train(
|
||||||
|
params = params,
|
||||||
|
data = qdm,
|
||||||
|
nrounds = 2
|
||||||
|
)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: ExternalDMatrix produces the same results as regular DMatrix", {
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars[, 1]
|
||||||
|
x <- as.matrix(mtcars[, -1])
|
||||||
|
set.seed(123)
|
||||||
|
params <- list(
|
||||||
|
objective = "reg:squarederror",
|
||||||
|
nthread = n_threads
|
||||||
|
)
|
||||||
|
model <- xgb.train(
|
||||||
|
data = xgb.DMatrix(x, label = y),
|
||||||
|
params = params,
|
||||||
|
nrounds = 5
|
||||||
|
)
|
||||||
|
pred <- predict(model, x)
|
||||||
|
|
||||||
|
iterator_env <- as.environment(
|
||||||
|
list(
|
||||||
|
iter = 0,
|
||||||
|
x = mtcars[, -1],
|
||||||
|
y = mtcars[, 1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
iterator_next <- function(iterator_env) {
|
||||||
|
curr_iter <- iterator_env[["iter"]]
|
||||||
|
if (curr_iter >= 2) {
|
||||||
|
return(NULL)
|
||||||
|
}
|
||||||
|
if (curr_iter == 0) {
|
||||||
|
x_batch <- iterator_env[["x"]][1:16, ]
|
||||||
|
y_batch <- iterator_env[["y"]][1:16]
|
||||||
|
} else {
|
||||||
|
x_batch <- iterator_env[["x"]][17:32, ]
|
||||||
|
y_batch <- iterator_env[["y"]][17:32]
|
||||||
|
}
|
||||||
|
on.exit({
|
||||||
|
iterator_env[["iter"]] <- curr_iter + 1
|
||||||
|
})
|
||||||
|
return(xgb.DataBatch(data = x_batch, label = y_batch))
|
||||||
|
}
|
||||||
|
iterator_reset <- function(iterator_env) {
|
||||||
|
iterator_env[["iter"]] <- 0
|
||||||
|
}
|
||||||
|
data_iterator <- xgb.DataIter(
|
||||||
|
env = iterator_env,
|
||||||
|
f_next = iterator_next,
|
||||||
|
f_reset = iterator_reset
|
||||||
|
)
|
||||||
|
cache_prefix <- tempdir()
|
||||||
|
edm <- xgb.ExternalDMatrix(data_iterator, cache_prefix, nthread = 1)
|
||||||
|
expect_true(inherits(edm, "xgb.ExternalDMatrix"))
|
||||||
|
expect_true(inherits(edm, "xgb.DMatrix"))
|
||||||
|
set.seed(123)
|
||||||
|
model_ext <- xgb.train(
|
||||||
|
data = edm,
|
||||||
|
params = params,
|
||||||
|
nrounds = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
pred_model1_edm <- predict(model, edm)
|
||||||
|
pred_model2_mat <- predict(model_ext, x)
|
||||||
|
pred_model2_edm <- predict(model_ext, edm)
|
||||||
|
|
||||||
|
expect_equal(pred_model1_edm, pred)
|
||||||
|
expect_equal(pred_model2_mat, pred)
|
||||||
|
expect_equal(pred_model2_edm, pred)
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: External QDM produces same results as regular QDM", {
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars[, 1]
|
||||||
|
x <- as.matrix(mtcars[, -1])
|
||||||
|
set.seed(123)
|
||||||
|
params <- list(
|
||||||
|
objective = "reg:squarederror",
|
||||||
|
nthread = n_threads,
|
||||||
|
max_bin = 3
|
||||||
|
)
|
||||||
|
model <- xgb.train(
|
||||||
|
data = xgb.QuantileDMatrix(
|
||||||
|
x,
|
||||||
|
label = y,
|
||||||
|
nthread = 1,
|
||||||
|
max_bin = 3
|
||||||
|
),
|
||||||
|
params = params,
|
||||||
|
nrounds = 5
|
||||||
|
)
|
||||||
|
pred <- predict(model, x)
|
||||||
|
|
||||||
|
iterator_env <- as.environment(
|
||||||
|
list(
|
||||||
|
iter = 0,
|
||||||
|
x = mtcars[, -1],
|
||||||
|
y = mtcars[, 1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
iterator_next <- function(iterator_env) {
|
||||||
|
curr_iter <- iterator_env[["iter"]]
|
||||||
|
if (curr_iter >= 2) {
|
||||||
|
return(NULL)
|
||||||
|
}
|
||||||
|
if (curr_iter == 0) {
|
||||||
|
x_batch <- iterator_env[["x"]][1:16, ]
|
||||||
|
y_batch <- iterator_env[["y"]][1:16]
|
||||||
|
} else {
|
||||||
|
x_batch <- iterator_env[["x"]][17:32, ]
|
||||||
|
y_batch <- iterator_env[["y"]][17:32]
|
||||||
|
}
|
||||||
|
on.exit({
|
||||||
|
iterator_env[["iter"]] <- curr_iter + 1
|
||||||
|
})
|
||||||
|
return(xgb.DataBatch(data = x_batch, label = y_batch))
|
||||||
|
}
|
||||||
|
iterator_reset <- function(iterator_env) {
|
||||||
|
iterator_env[["iter"]] <- 0
|
||||||
|
}
|
||||||
|
data_iterator <- xgb.DataIter(
|
||||||
|
env = iterator_env,
|
||||||
|
f_next = iterator_next,
|
||||||
|
f_reset = iterator_reset
|
||||||
|
)
|
||||||
|
cache_prefix <- tempdir()
|
||||||
|
qdm <- xgb.QuantileDMatrix.from_iterator(
|
||||||
|
data_iterator,
|
||||||
|
max_bin = 3,
|
||||||
|
nthread = 1
|
||||||
|
)
|
||||||
|
expect_true(inherits(qdm, "xgb.QuantileDMatrix"))
|
||||||
|
expect_true(inherits(qdm, "xgb.DMatrix"))
|
||||||
|
set.seed(123)
|
||||||
|
model_ext <- xgb.train(
|
||||||
|
data = qdm,
|
||||||
|
params = params,
|
||||||
|
nrounds = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
pred_model1_qdm <- predict(model, qdm)
|
||||||
|
pred_model2_mat <- predict(model_ext, x)
|
||||||
|
pred_model2_qdm <- predict(model_ext, qdm)
|
||||||
|
|
||||||
|
expect_equal(pred_model1_qdm, pred)
|
||||||
|
expect_equal(pred_model2_mat, pred)
|
||||||
|
expect_equal(pred_model2_qdm, pred)
|
||||||
|
})
|
||||||
|
|
||||||
|
test_that("xgb.DMatrix: R errors thrown on DataIterator are thrown back to the user", {
|
||||||
|
data(mtcars)
|
||||||
|
iterator_env <- as.environment(
|
||||||
|
list(
|
||||||
|
iter = 0,
|
||||||
|
x = mtcars[, -1],
|
||||||
|
y = mtcars[, 1]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
iterator_next <- function(iterator_env) {
|
||||||
|
curr_iter <- iterator_env[["iter"]]
|
||||||
|
if (curr_iter >= 2) {
|
||||||
|
return(0)
|
||||||
|
}
|
||||||
|
if (curr_iter == 0) {
|
||||||
|
x_batch <- iterator_env[["x"]][1:16, ]
|
||||||
|
y_batch <- iterator_env[["y"]][1:16]
|
||||||
|
} else {
|
||||||
|
stop("custom error")
|
||||||
|
}
|
||||||
|
on.exit({
|
||||||
|
iterator_env[["iter"]] <- curr_iter + 1
|
||||||
|
})
|
||||||
|
return(xgb.DataBatch(data = x_batch, label = y_batch))
|
||||||
|
}
|
||||||
|
iterator_reset <- function(iterator_env) {
|
||||||
|
iterator_env[["iter"]] <- 0
|
||||||
|
}
|
||||||
|
data_iterator <- xgb.DataIter(
|
||||||
|
env = iterator_env,
|
||||||
|
f_next = iterator_next,
|
||||||
|
f_reset = iterator_reset
|
||||||
|
)
|
||||||
|
expect_error(
|
||||||
|
{xgb.ExternalDMatrix(data_iterator, nthread = 1)},
|
||||||
|
"custom error"
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
test_that("xgb.DMatrix: number of non-missing matches data", {
|
test_that("xgb.DMatrix: number of non-missing matches data", {
|
||||||
x <- matrix(1:10, nrow = 5)
|
x <- matrix(1:10, nrow = 5)
|
||||||
dm1 <- xgb.DMatrix(x)
|
dm1 <- xgb.DMatrix(x)
|
||||||
|
|||||||
@ -511,3 +511,82 @@ test_that('convert.labels works', {
|
|||||||
expect_equal(class(res), 'numeric')
|
expect_equal(class(res), 'numeric')
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("validate.features works as expected", {
|
||||||
|
data(mtcars)
|
||||||
|
y <- mtcars$mpg
|
||||||
|
x <- as.matrix(mtcars[, -1])
|
||||||
|
dm <- xgb.DMatrix(x, label = y, nthread = 1)
|
||||||
|
model <- xgb.train(
|
||||||
|
params = list(nthread = 1),
|
||||||
|
data = dm,
|
||||||
|
nrounds = 3
|
||||||
|
)
|
||||||
|
|
||||||
|
# result is output as-is when needed
|
||||||
|
res <- validate.features(model, x)
|
||||||
|
expect_equal(res, x)
|
||||||
|
res <- validate.features(model, dm)
|
||||||
|
expect_identical(res, dm)
|
||||||
|
res <- validate.features(model, as(x[1, ], "dsparseVector"))
|
||||||
|
expect_equal(as.numeric(res), unname(x[1, ]))
|
||||||
|
res <- validate.features(model, "file.txt")
|
||||||
|
expect_equal(res, "file.txt")
|
||||||
|
|
||||||
|
# columns are reordered
|
||||||
|
res <- validate.features(model, mtcars[, rev(names(mtcars))])
|
||||||
|
expect_equal(names(res), colnames(x))
|
||||||
|
expect_equal(as.matrix(res), x)
|
||||||
|
res <- validate.features(model, as.matrix(mtcars[, rev(names(mtcars))]))
|
||||||
|
expect_equal(colnames(res), colnames(x))
|
||||||
|
expect_equal(res, x)
|
||||||
|
res <- validate.features(model, mtcars[1, rev(names(mtcars)), drop = FALSE])
|
||||||
|
expect_equal(names(res), colnames(x))
|
||||||
|
expect_equal(unname(as.matrix(res)), unname(x[1, , drop = FALSE]))
|
||||||
|
res <- validate.features(model, as.data.table(mtcars[, rev(names(mtcars))]))
|
||||||
|
expect_equal(names(res), colnames(x))
|
||||||
|
expect_equal(unname(as.matrix(res)), unname(x))
|
||||||
|
|
||||||
|
# error when columns are missing
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, mtcars[, 1:3])
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, as.matrix(mtcars[, 1:ncol(x)])) # nolint
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, xgb.DMatrix(mtcars[, 1:3]))
|
||||||
|
})
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, as(x[, 1:3], "CsparseMatrix"))
|
||||||
|
})
|
||||||
|
|
||||||
|
# error when it cannot reorder or subset
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, xgb.DMatrix(mtcars))
|
||||||
|
}, "Feature names")
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, xgb.DMatrix(x[, rev(colnames(x))]))
|
||||||
|
}, "Feature names")
|
||||||
|
|
||||||
|
# no error about types if the booster doesn't have types
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
|
||||||
|
}, NA)
|
||||||
|
tmp <- mtcars
|
||||||
|
tmp[["vs"]] <- factor(tmp[["vs"]])
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, tmp)
|
||||||
|
}, NA)
|
||||||
|
|
||||||
|
# error when types do not match
|
||||||
|
setinfo(model, "feature_type", rep("q", 10))
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, xgb.DMatrix(x, feature_types = c(rep("q", 5), rep("c", 5))))
|
||||||
|
}, "Feature types")
|
||||||
|
tmp <- mtcars
|
||||||
|
tmp[["vs"]] <- factor(tmp[["vs"]])
|
||||||
|
expect_error({
|
||||||
|
validate.features(model, tmp)
|
||||||
|
}, "Feature types")
|
||||||
|
})
|
||||||
|
|||||||
@ -6,6 +6,7 @@ This demo uses 1D toy data and visualizes how XGBoost fits a tree ensemble. The
|
|||||||
model starts out as a flat line and evolves into a step function in order to account for
|
model starts out as a flat line and evolves into a step function in order to account for
|
||||||
all ranged labels.
|
all ranged labels.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,7 @@ Example of training with Dask on CPU
|
|||||||
====================================
|
====================================
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
from dask.distributed import Client, LocalCluster
|
from dask.distributed import Client, LocalCluster
|
||||||
|
|
||||||
@ -14,8 +15,9 @@ def main(client):
|
|||||||
# generate some random data for demonstration
|
# generate some random data for demonstration
|
||||||
m = 100000
|
m = 100000
|
||||||
n = 100
|
n = 100
|
||||||
X = da.random.random(size=(m, n), chunks=100)
|
rng = da.random.default_rng(1)
|
||||||
y = da.random.random(size=(m,), chunks=100)
|
X = rng.normal(size=(m, n))
|
||||||
|
y = X.sum(axis=1)
|
||||||
|
|
||||||
# DaskDMatrix acts like normal DMatrix, works as a proxy for local
|
# DaskDMatrix acts like normal DMatrix, works as a proxy for local
|
||||||
# DMatrix scatter around workers.
|
# DMatrix scatter around workers.
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Example of using callbacks with Dask
|
Example of using callbacks with Dask
|
||||||
====================================
|
====================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from dask.distributed import Client, LocalCluster
|
from dask.distributed import Client, LocalCluster
|
||||||
from dask_ml.datasets import make_regression
|
from dask_ml.datasets import make_regression
|
||||||
|
|||||||
@ -2,6 +2,8 @@
|
|||||||
Example of training with Dask on GPU
|
Example of training with Dask on GPU
|
||||||
====================================
|
====================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import cupy as cp
|
||||||
import dask_cudf
|
import dask_cudf
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
from dask import dataframe as dd
|
from dask import dataframe as dd
|
||||||
@ -72,10 +74,12 @@ if __name__ == "__main__":
|
|||||||
with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
|
with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
|
||||||
with Client(cluster) as client:
|
with Client(cluster) as client:
|
||||||
# generate some random data for demonstration
|
# generate some random data for demonstration
|
||||||
|
rng = da.random.default_rng(1)
|
||||||
|
|
||||||
m = 100000
|
m = 100000
|
||||||
n = 100
|
n = 100
|
||||||
X = da.random.random(size=(m, n), chunks=10000)
|
X = rng.normal(size=(m, n))
|
||||||
y = da.random.random(size=(m,), chunks=10000)
|
y = X.sum(axis=1)
|
||||||
|
|
||||||
print("Using DaskQuantileDMatrix")
|
print("Using DaskQuantileDMatrix")
|
||||||
from_ddqdm = using_quantile_device_dmatrix(client, X, y)
|
from_ddqdm = using_quantile_device_dmatrix(client, X, y)
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Use scikit-learn regressor interface with CPU histogram tree method
|
Use scikit-learn regressor interface with CPU histogram tree method
|
||||||
===================================================================
|
===================================================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
from dask.distributed import Client, LocalCluster
|
from dask.distributed import Client, LocalCluster
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ Demo for using and defining callback functions
|
|||||||
|
|
||||||
.. versionadded:: 1.3.0
|
.. versionadded:: 1.3.0
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|||||||
@ -13,6 +13,7 @@ See Also
|
|||||||
- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
|
- :ref:`sphx_glr_python_examples_cat_in_the_dat.py`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@ -17,6 +17,7 @@ See Also
|
|||||||
- :ref:`sphx_glr_python_examples_cat_pipeline.py`
|
- :ref:`sphx_glr_python_examples_cat_pipeline.py`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@ -11,6 +11,7 @@ instead of Quantile DMatrix. The feature is not ready for production use yet.
|
|||||||
See :doc:`the tutorial </tutorials/external_memory>` for more details.
|
See :doc:`the tutorial </tutorials/external_memory>` for more details.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Callable, List, Tuple
|
from typing import Callable, List, Tuple
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Demo for prediction using individual trees and model slices
|
Demo for prediction using individual trees and model slices
|
||||||
===========================================================
|
===========================================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@ -15,6 +15,7 @@ position debiasing training.
|
|||||||
For an overview of learning to rank in XGBoost, please see
|
For an overview of learning to rank in XGBoost, please see
|
||||||
:doc:`Learning to Rank </tutorials/learning_to_rank>`.
|
:doc:`Learning to Rank </tutorials/learning_to_rank>`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|||||||
@ -13,6 +13,7 @@ https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_qu
|
|||||||
crossing can happen due to limitation in the algorithm.
|
crossing can happen due to limitation in the algorithm.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ Created on 1 Apr 2015
|
|||||||
|
|
||||||
@author: Jamie Hall
|
@author: Jamie Hall
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Demo for using xgboost with sklearn
|
Demo for using xgboost with sklearn
|
||||||
===================================
|
===================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
||||||
from sklearn.datasets import fetch_california_housing
|
from sklearn.datasets import fetch_california_housing
|
||||||
|
|||||||
@ -4,6 +4,7 @@ Collection of examples for using xgboost.spark estimator interface
|
|||||||
|
|
||||||
@author: Weichen Xu
|
@author: Weichen Xu
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sklearn.datasets
|
import sklearn.datasets
|
||||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
|
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
|
||||||
from pyspark.ml.linalg import Vectors
|
from pyspark.ml.linalg import Vectors
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Using rmm with Dask
|
Using rmm with Dask
|
||||||
===================
|
===================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import dask
|
import dask
|
||||||
from dask.distributed import Client
|
from dask.distributed import Client
|
||||||
from dask_cuda import LocalCUDACluster
|
from dask_cuda import LocalCUDACluster
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
Using rmm on a single node device
|
Using rmm on a single node device
|
||||||
=================================
|
=================================
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import rmm
|
import rmm
|
||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
|
|
||||||
|
|||||||
@ -26,3 +26,12 @@ Tutorials
|
|||||||
|
|
||||||
Introduction to XGBoost in R <xgboostPresentation>
|
Introduction to XGBoost in R <xgboostPresentation>
|
||||||
Understanding your dataset with XGBoost <discoverYourData>
|
Understanding your dataset with XGBoost <discoverYourData>
|
||||||
|
|
||||||
|
************
|
||||||
|
Other topics
|
||||||
|
************
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:titlesonly:
|
||||||
|
Handling of indexable elements <index_base>
|
||||||
|
|||||||
29
doc/R-package/index_base.rst
Normal file
29
doc/R-package/index_base.rst
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
.. _index_base:
|
||||||
|
|
||||||
|
Handling of indexable elements
|
||||||
|
==============================
|
||||||
|
|
||||||
|
There are many functionalities in XGBoost which refer to indexable elements in a countable set, such as boosting rounds / iterations / trees in a model (which can be referred to by number), classes, categories / levels in categorical features, among others.
|
||||||
|
|
||||||
|
XGBoost, being written in C++, uses base-0 indexing and considers ranges / sequences to be inclusive of the left end but not the right one - for example, a range (0, 3) would include the first three elements, numbered 0, 1, and 2.
|
||||||
|
|
||||||
|
The Python interface uses this same logic, since this is also the way that indexing in Python works, but other languages like R have different logic. In R, indexing is base-1 and ranges / sequences are inclusive of both ends - for example, to refer to the first three elements in a sequence, the interval would be written as (1, 3), and the elements numbered 1, 2, and 3.
|
||||||
|
|
||||||
|
In order to provide a more idiomatic R interface, XGBoost adjusts its user-facing R interface to follow this and similar R conventions, but internally, it needs to convert all these numbers to the format that the C interface uses. This is made more problematic by the fact that models are meant to be serializable and loadable in other interfaces, which will have different indexing logic.
|
||||||
|
|
||||||
|
The following adjustments are made in the R interface:
|
||||||
|
|
||||||
|
- Slicing method for DMatrix, which takes an array of integers, is converted to base-0 indexing by subtracting 1 from each element. Note that this is done in the C-level wrapper function for R, unlike all other conversions which are done in R before being passed to C.
|
||||||
|
- Slicing method for Booster takes a sequence defined by start, end, and step. The R interface is made to work the same way as R's ``seq`` from the user's POV, so it always adjusts the left end by subtracting one, and depending on whether the step size ends exactly or not at the right end, will also adjust the right end to be non-inclusive in C indexing.
|
||||||
|
- Parameter ``iterationrange`` in ``predict`` is also made to behave the same way as R's ``seq``. Since it doesn't have a step size, just adjusting the left end by subtracting 1 suffices here.
|
||||||
|
- ``best_iteration``, depending on the context, might be stored as both a C-level booster attribute, and as an R attribute. Since the C-level attributes are shared across interfaces and used in prediction methods, in order to improve compatibility, it leaves this C-level attribute in base-0 indexing, but the R attribute, if present, will be adjusted to base-1 indexing. Note that the ``predict`` method in R and other interfaces will look at the C-level attribute only.
|
||||||
|
- Other references to iteration numbers or boosting rounds, such as when printing metrics or saving model snapshots, also follow base-1 indexing. These other references are coded entirely in R, as the C-level functions do not handle such functionalities.
|
||||||
|
- Terminal leaf / node numbers are returned in base-0 indexing, just like they come from the C interface.
|
||||||
|
- Tree numbers in plots follow base-1 indexing. Note that these are only displayed when producing these plots through the R interface's own handling of DiagrammeR objects, but not when using the C-level GraphViz 'dot' format generator for plots.
|
||||||
|
- Feature numbers when producing feature importances, JSONs, trees-to-tables, and SHAP; are all following base-0 indexing.
|
||||||
|
- Categorical features are defined in R as a ``factor`` type which encodes with base-1 indexing. When categorical features are passed as R ``factor`` types, the conversion is done automatically to base-0 indexing, but if the user whishes to manually supply categorical features as already-encoded integers, then those integers need to already be in base-0 encoding.
|
||||||
|
- Categorical levels (categories) in outputs such as plots, JSONs, and trees-to-tables; are also referred to using base-0 indexing, regardless of whether they went into the model as integers or as ``factor``-typed columns.
|
||||||
|
- Categorical labels for DMatrices do not undergo any extra processing - the user must supply base-0 encoded labels.
|
||||||
|
- A function to retrieve class-specific coefficients when using the linear coefficients history callback takes a class index parameter, which also does not undergo any conversion (i.e. user must pass a base-0 index), in order to match with the label logic - that is, the same class index will refer to the class encoded with that number in the DMatrix ``label`` field.
|
||||||
|
|
||||||
|
New additions to the R interface that take on indexable elements should be mindful of these conventions and try to mimic R's behavior as much as possible.
|
||||||
@ -104,7 +104,7 @@ using cross validation with early stopping, here is a snippet to begin with:
|
|||||||
|
|
||||||
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
|
clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=3)
|
||||||
|
|
||||||
resutls = {}
|
results = {}
|
||||||
|
|
||||||
for train, test in cv.split(X, y):
|
for train, test in cv.split(X, y):
|
||||||
X_train = X[train]
|
X_train = X[train]
|
||||||
@ -114,7 +114,7 @@ using cross validation with early stopping, here is a snippet to begin with:
|
|||||||
est, train_score, test_score = fit_and_score(
|
est, train_score, test_score = fit_and_score(
|
||||||
clone(clf), X_train, X_test, y_train, y_test
|
clone(clf), X_train, X_test, y_train, y_test
|
||||||
)
|
)
|
||||||
resutls[est] = (train_score, test_score)
|
results[est] = (train_score, test_score)
|
||||||
|
|
||||||
|
|
||||||
***********************************
|
***********************************
|
||||||
|
|||||||
@ -96,8 +96,8 @@ Sample Script
|
|||||||
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
# read in data
|
# read in data
|
||||||
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
|
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train?format=libsvm')
|
||||||
dtest = xgb.DMatrix('demo/data/agaricus.txt.test')
|
dtest = xgb.DMatrix('demo/data/agaricus.txt.test?format=libsvm')
|
||||||
# specify parameters via map
|
# specify parameters via map
|
||||||
param = {'booster': 'dart',
|
param = {'booster': 'dart',
|
||||||
'max_depth': 5, 'learning_rate': 0.1,
|
'max_depth': 5, 'learning_rate': 0.1,
|
||||||
|
|||||||
101
plugin/sycl/common/partition_builder.h
Normal file
101
plugin/sycl/common/partition_builder.h
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
/*!
|
||||||
|
* Copyright 2017-2024 XGBoost contributors
|
||||||
|
*/
|
||||||
|
#ifndef PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
|
||||||
|
#define PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
|
||||||
|
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
|
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||||
|
#include <xgboost/data.h>
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
#include <xgboost/tree_model.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
|
#include "../../../src/common/column_matrix.h"
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
|
#include "../data.h"
|
||||||
|
|
||||||
|
#include <CL/sycl.hpp>
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
namespace sycl {
|
||||||
|
namespace common {
|
||||||
|
|
||||||
|
// The builder is required for samples partition to left and rights children for set of nodes
|
||||||
|
class PartitionBuilder {
|
||||||
|
public:
|
||||||
|
template<typename Func>
|
||||||
|
void Init(::sycl::queue* qu, size_t n_nodes, Func funcNTaks) {
|
||||||
|
qu_ = qu;
|
||||||
|
nodes_offsets_.resize(n_nodes+1);
|
||||||
|
result_rows_.resize(2 * n_nodes);
|
||||||
|
n_nodes_ = n_nodes;
|
||||||
|
|
||||||
|
|
||||||
|
nodes_offsets_[0] = 0;
|
||||||
|
for (size_t i = 1; i < n_nodes+1; ++i) {
|
||||||
|
nodes_offsets_[i] = nodes_offsets_[i-1] + funcNTaks(i-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data_.Size() < nodes_offsets_[n_nodes]) {
|
||||||
|
data_.Resize(qu, nodes_offsets_[n_nodes]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GetNLeftElems(int nid) const {
|
||||||
|
return result_rows_[2 * nid];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
size_t GetNRightElems(int nid) const {
|
||||||
|
return result_rows_[2 * nid + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
// For test purposes only
|
||||||
|
void SetNLeftElems(int nid, size_t val) {
|
||||||
|
result_rows_[2 * nid] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For test purposes only
|
||||||
|
void SetNRightElems(int nid, size_t val) {
|
||||||
|
result_rows_[2 * nid + 1] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
xgboost::common::Span<size_t> GetData(int nid) {
|
||||||
|
return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
|
||||||
|
}
|
||||||
|
|
||||||
|
void MergeToArray(size_t nid,
|
||||||
|
size_t* data_result,
|
||||||
|
::sycl::event event) {
|
||||||
|
size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
|
||||||
|
if (n_nodes_total > 0) {
|
||||||
|
const size_t* data = data_.Data() + nodes_offsets_[nid];
|
||||||
|
qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::vector<size_t> nodes_offsets_;
|
||||||
|
std::vector<size_t> result_rows_;
|
||||||
|
size_t n_nodes_;
|
||||||
|
|
||||||
|
USMVector<size_t, MemoryType::on_device> parts_size_;
|
||||||
|
USMVector<size_t, MemoryType::on_device> data_;
|
||||||
|
|
||||||
|
::sycl::queue* qu_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace common
|
||||||
|
} // namespace sycl
|
||||||
|
} // namespace xgboost
|
||||||
|
|
||||||
|
|
||||||
|
#endif // PLUGIN_SYCL_COMMON_PARTITION_BUILDER_H_
|
||||||
@ -2,6 +2,7 @@
|
|||||||
Custom hook to customize the behavior of Hatchling.
|
Custom hook to customize the behavior of Hatchling.
|
||||||
Here, we customize the tag of the generated wheels.
|
Here, we customize the tag of the generated wheels.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sysconfig
|
import sysconfig
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Build configuration"""
|
"""Build configuration"""
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Functions for building libxgboost
|
Functions for building libxgboost
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|||||||
@ -4,6 +4,7 @@ Builds source distribution and binary wheels, following PEP 517 / PEP 660.
|
|||||||
Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
|
Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
|
||||||
of brevity.
|
of brevity.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Functions for building sdist
|
Functions for building sdist
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Utility functions for implementing PEP 517 backend
|
Utility functions for implementing PEP 517 backend
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import pathlib
|
import pathlib
|
||||||
import shutil
|
import shutil
|
||||||
|
|||||||
@ -36,6 +36,11 @@ PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
|
|||||||
|
|
||||||
FloatCompatible = Union[float, np.float32, np.float64]
|
FloatCompatible = Union[float, np.float32, np.float64]
|
||||||
|
|
||||||
|
# typing.SupportsInt is not suitable here since floating point values are convertible to
|
||||||
|
# integers as well.
|
||||||
|
Integer = Union[int, np.integer]
|
||||||
|
IterationRange = Tuple[Integer, Integer]
|
||||||
|
|
||||||
# callables
|
# callables
|
||||||
FPreProcCallable = Callable
|
FPreProcCallable = Callable
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""XGBoost collective communication related API."""
|
"""XGBoost collective communication related API."""
|
||||||
|
|
||||||
import ctypes
|
import ctypes
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
|||||||
@ -48,6 +48,8 @@ from ._typing import (
|
|||||||
FeatureInfo,
|
FeatureInfo,
|
||||||
FeatureNames,
|
FeatureNames,
|
||||||
FeatureTypes,
|
FeatureTypes,
|
||||||
|
Integer,
|
||||||
|
IterationRange,
|
||||||
ModelIn,
|
ModelIn,
|
||||||
NumpyOrCupy,
|
NumpyOrCupy,
|
||||||
TransformedData,
|
TransformedData,
|
||||||
@ -62,13 +64,11 @@ class XGBoostError(ValueError):
|
|||||||
|
|
||||||
|
|
||||||
@overload
|
@overload
|
||||||
def from_pystr_to_cstr(data: str) -> bytes:
|
def from_pystr_to_cstr(data: str) -> bytes: ...
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
@overload
|
||||||
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array:
|
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array: ...
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
|
def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
|
||||||
@ -798,9 +798,23 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
|||||||
Set names for features.
|
Set names for features.
|
||||||
feature_types :
|
feature_types :
|
||||||
|
|
||||||
Set types for features. When `enable_categorical` is set to `True`, string
|
Set types for features. If `data` is a DataFrame type and passing
|
||||||
"c" represents categorical data type while "q" represents numerical feature
|
`enable_categorical=True`, the types will be deduced automatically
|
||||||
type. For categorical features, the input is assumed to be preprocessed and
|
from the column types.
|
||||||
|
|
||||||
|
Otherwise, one can pass a list-like input with the same length as number
|
||||||
|
of columns in `data`, with the following possible values:
|
||||||
|
- "c", which represents categorical columns.
|
||||||
|
- "q", which represents numeric columns.
|
||||||
|
- "int", which represents integer columns.
|
||||||
|
- "i", which represents boolean columns.
|
||||||
|
|
||||||
|
Note that, while categorical types are treated differently from
|
||||||
|
the rest for model fitting purposes, the other types do not influence
|
||||||
|
the generated model, but have effects in other functionalities such as
|
||||||
|
feature importances.
|
||||||
|
|
||||||
|
For categorical features, the input is assumed to be preprocessed and
|
||||||
encoded by the users. The encoding can be done via
|
encoded by the users. The encoding can be done via
|
||||||
:py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe
|
:py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe
|
||||||
`.cat.codes` method. This is useful when users want to specify categorical
|
`.cat.codes` method. This is useful when users want to specify categorical
|
||||||
@ -1812,19 +1826,25 @@ class Booster:
|
|||||||
state["handle"] = handle
|
state["handle"] = handle
|
||||||
self.__dict__.update(state)
|
self.__dict__.update(state)
|
||||||
|
|
||||||
def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
|
def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
|
||||||
"""Get a slice of the tree-based model.
|
"""Get a slice of the tree-based model.
|
||||||
|
|
||||||
.. versionadded:: 1.3.0
|
.. versionadded:: 1.3.0
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if isinstance(val, int):
|
# convert to slice for all other types
|
||||||
val = slice(val, val + 1)
|
if isinstance(val, (np.integer, int)):
|
||||||
|
val = slice(int(val), int(val + 1))
|
||||||
|
if isinstance(val, type(Ellipsis)):
|
||||||
|
val = slice(0, 0)
|
||||||
if isinstance(val, tuple):
|
if isinstance(val, tuple):
|
||||||
raise ValueError("Only supports slicing through 1 dimension.")
|
raise ValueError("Only supports slicing through 1 dimension.")
|
||||||
|
# All supported types are now slice
|
||||||
|
# FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
|
||||||
if not isinstance(val, slice):
|
if not isinstance(val, slice):
|
||||||
msg = _expect((int, slice), type(val))
|
msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
|
||||||
raise TypeError(msg)
|
raise TypeError(msg)
|
||||||
|
|
||||||
if isinstance(val.start, type(Ellipsis)) or val.start is None:
|
if isinstance(val.start, type(Ellipsis)) or val.start is None:
|
||||||
start = 0
|
start = 0
|
||||||
else:
|
else:
|
||||||
@ -2246,12 +2266,13 @@ class Booster:
|
|||||||
pred_interactions: bool = False,
|
pred_interactions: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
training: bool = False,
|
training: bool = False,
|
||||||
iteration_range: Tuple[int, int] = (0, 0),
|
iteration_range: IterationRange = (0, 0),
|
||||||
strict_shape: bool = False,
|
strict_shape: bool = False,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""Predict with data. The full model will be used unless `iteration_range` is specified,
|
"""Predict with data. The full model will be used unless `iteration_range` is
|
||||||
meaning user have to either slice the model or use the ``best_iteration``
|
specified, meaning user have to either slice the model or use the
|
||||||
attribute to get prediction from best model returned from early stopping.
|
``best_iteration`` attribute to get prediction from best model returned from
|
||||||
|
early stopping.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
@ -2336,8 +2357,8 @@ class Booster:
|
|||||||
args = {
|
args = {
|
||||||
"type": 0,
|
"type": 0,
|
||||||
"training": training,
|
"training": training,
|
||||||
"iteration_begin": iteration_range[0],
|
"iteration_begin": int(iteration_range[0]),
|
||||||
"iteration_end": iteration_range[1],
|
"iteration_end": int(iteration_range[1]),
|
||||||
"strict_shape": strict_shape,
|
"strict_shape": strict_shape,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2373,7 +2394,7 @@ class Booster:
|
|||||||
def inplace_predict(
|
def inplace_predict(
|
||||||
self,
|
self,
|
||||||
data: DataType,
|
data: DataType,
|
||||||
iteration_range: Tuple[int, int] = (0, 0),
|
iteration_range: IterationRange = (0, 0),
|
||||||
predict_type: str = "value",
|
predict_type: str = "value",
|
||||||
missing: float = np.nan,
|
missing: float = np.nan,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
@ -2439,8 +2460,8 @@ class Booster:
|
|||||||
args = make_jcargs(
|
args = make_jcargs(
|
||||||
type=1 if predict_type == "margin" else 0,
|
type=1 if predict_type == "margin" else 0,
|
||||||
training=False,
|
training=False,
|
||||||
iteration_begin=iteration_range[0],
|
iteration_begin=int(iteration_range[0]),
|
||||||
iteration_end=iteration_range[1],
|
iteration_end=int(iteration_range[1]),
|
||||||
missing=missing,
|
missing=missing,
|
||||||
strict_shape=strict_shape,
|
strict_shape=strict_shape,
|
||||||
cache_id=0,
|
cache_id=0,
|
||||||
|
|||||||
@ -61,7 +61,7 @@ from typing import (
|
|||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from xgboost import collective, config
|
from xgboost import collective, config
|
||||||
from xgboost._typing import _T, FeatureNames, FeatureTypes
|
from xgboost._typing import _T, FeatureNames, FeatureTypes, IterationRange
|
||||||
from xgboost.callback import TrainingCallback
|
from xgboost.callback import TrainingCallback
|
||||||
from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
|
from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance
|
||||||
from xgboost.core import (
|
from xgboost.core import (
|
||||||
@ -1146,9 +1146,9 @@ async def _direct_predict_impl( # pylint: disable=too-many-branches
|
|||||||
if _can_output_df(isinstance(data, dd.DataFrame), output_shape):
|
if _can_output_df(isinstance(data, dd.DataFrame), output_shape):
|
||||||
if base_margin is not None and isinstance(base_margin, da.Array):
|
if base_margin is not None and isinstance(base_margin, da.Array):
|
||||||
# Easier for map_partitions
|
# Easier for map_partitions
|
||||||
base_margin_df: Optional[
|
base_margin_df: Optional[Union[dd.DataFrame, dd.Series]] = (
|
||||||
Union[dd.DataFrame, dd.Series]
|
base_margin.to_dask_dataframe()
|
||||||
] = base_margin.to_dask_dataframe()
|
)
|
||||||
else:
|
else:
|
||||||
base_margin_df = base_margin
|
base_margin_df = base_margin
|
||||||
predictions = dd.map_partitions(
|
predictions = dd.map_partitions(
|
||||||
@ -1263,7 +1263,7 @@ async def _predict_async(
|
|||||||
approx_contribs: bool,
|
approx_contribs: bool,
|
||||||
pred_interactions: bool,
|
pred_interactions: bool,
|
||||||
validate_features: bool,
|
validate_features: bool,
|
||||||
iteration_range: Tuple[int, int],
|
iteration_range: IterationRange,
|
||||||
strict_shape: bool,
|
strict_shape: bool,
|
||||||
) -> _DaskCollection:
|
) -> _DaskCollection:
|
||||||
_booster = await _get_model_future(client, model)
|
_booster = await _get_model_future(client, model)
|
||||||
@ -1410,7 +1410,7 @@ def predict( # pylint: disable=unused-argument
|
|||||||
approx_contribs: bool = False,
|
approx_contribs: bool = False,
|
||||||
pred_interactions: bool = False,
|
pred_interactions: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
iteration_range: Tuple[int, int] = (0, 0),
|
iteration_range: IterationRange = (0, 0),
|
||||||
strict_shape: bool = False,
|
strict_shape: bool = False,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
"""Run prediction with a trained booster.
|
"""Run prediction with a trained booster.
|
||||||
@ -1458,7 +1458,7 @@ async def _inplace_predict_async( # pylint: disable=too-many-branches
|
|||||||
global_config: Dict[str, Any],
|
global_config: Dict[str, Any],
|
||||||
model: Union[Booster, Dict, "distributed.Future"],
|
model: Union[Booster, Dict, "distributed.Future"],
|
||||||
data: _DataT,
|
data: _DataT,
|
||||||
iteration_range: Tuple[int, int],
|
iteration_range: IterationRange,
|
||||||
predict_type: str,
|
predict_type: str,
|
||||||
missing: float,
|
missing: float,
|
||||||
validate_features: bool,
|
validate_features: bool,
|
||||||
@ -1516,7 +1516,7 @@ def inplace_predict( # pylint: disable=unused-argument
|
|||||||
client: Optional["distributed.Client"],
|
client: Optional["distributed.Client"],
|
||||||
model: Union[TrainReturnT, Booster, "distributed.Future"],
|
model: Union[TrainReturnT, Booster, "distributed.Future"],
|
||||||
data: _DataT,
|
data: _DataT,
|
||||||
iteration_range: Tuple[int, int] = (0, 0),
|
iteration_range: IterationRange = (0, 0),
|
||||||
predict_type: str = "value",
|
predict_type: str = "value",
|
||||||
missing: float = numpy.nan,
|
missing: float = numpy.nan,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
@ -1624,7 +1624,7 @@ class DaskScikitLearnBase(XGBModel):
|
|||||||
output_margin: bool,
|
output_margin: bool,
|
||||||
validate_features: bool,
|
validate_features: bool,
|
||||||
base_margin: Optional[_DaskCollection],
|
base_margin: Optional[_DaskCollection],
|
||||||
iteration_range: Optional[Tuple[int, int]],
|
iteration_range: Optional[IterationRange],
|
||||||
) -> Any:
|
) -> Any:
|
||||||
iteration_range = self._get_iteration_range(iteration_range)
|
iteration_range = self._get_iteration_range(iteration_range)
|
||||||
if self._can_use_inplace_predict():
|
if self._can_use_inplace_predict():
|
||||||
@ -1664,7 +1664,7 @@ class DaskScikitLearnBase(XGBModel):
|
|||||||
output_margin: bool = False,
|
output_margin: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[_DaskCollection] = None,
|
base_margin: Optional[_DaskCollection] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
_assert_dask_support()
|
_assert_dask_support()
|
||||||
return self.client.sync(
|
return self.client.sync(
|
||||||
@ -1679,7 +1679,7 @@ class DaskScikitLearnBase(XGBModel):
|
|||||||
async def _apply_async(
|
async def _apply_async(
|
||||||
self,
|
self,
|
||||||
X: _DataT,
|
X: _DataT,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
iteration_range = self._get_iteration_range(iteration_range)
|
iteration_range = self._get_iteration_range(iteration_range)
|
||||||
test_dmatrix = await DaskDMatrix(
|
test_dmatrix = await DaskDMatrix(
|
||||||
@ -1700,7 +1700,7 @@ class DaskScikitLearnBase(XGBModel):
|
|||||||
def apply(
|
def apply(
|
||||||
self,
|
self,
|
||||||
X: _DataT,
|
X: _DataT,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
_assert_dask_support()
|
_assert_dask_support()
|
||||||
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
|
return self.client.sync(self._apply_async, X, iteration_range=iteration_range)
|
||||||
@ -1962,7 +1962,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
|
|||||||
X: _DataT,
|
X: _DataT,
|
||||||
validate_features: bool,
|
validate_features: bool,
|
||||||
base_margin: Optional[_DaskCollection],
|
base_margin: Optional[_DaskCollection],
|
||||||
iteration_range: Optional[Tuple[int, int]],
|
iteration_range: Optional[IterationRange],
|
||||||
) -> _DaskCollection:
|
) -> _DaskCollection:
|
||||||
if self.objective == "multi:softmax":
|
if self.objective == "multi:softmax":
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -1987,7 +1987,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
|
|||||||
X: _DaskCollection,
|
X: _DaskCollection,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[_DaskCollection] = None,
|
base_margin: Optional[_DaskCollection] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
_assert_dask_support()
|
_assert_dask_support()
|
||||||
return self._client_sync(
|
return self._client_sync(
|
||||||
@ -2006,7 +2006,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
|
|||||||
output_margin: bool,
|
output_margin: bool,
|
||||||
validate_features: bool,
|
validate_features: bool,
|
||||||
base_margin: Optional[_DaskCollection],
|
base_margin: Optional[_DaskCollection],
|
||||||
iteration_range: Optional[Tuple[int, int]],
|
iteration_range: Optional[IterationRange],
|
||||||
) -> _DaskCollection:
|
) -> _DaskCollection:
|
||||||
pred_probs = await super()._predict_async(
|
pred_probs = await super()._predict_async(
|
||||||
data, output_margin, validate_features, base_margin, iteration_range
|
data, output_margin, validate_features, base_margin, iteration_range
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Utilities for the XGBoost Dask interface."""
|
"""Utilities for the XGBoost Dask interface."""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import TYPE_CHECKING, Any, Dict
|
from typing import TYPE_CHECKING, Any, Dict
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ from typing import (
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.special import softmax
|
from scipy.special import softmax
|
||||||
|
|
||||||
from ._typing import ArrayLike, FeatureNames, FeatureTypes, ModelIn
|
from ._typing import ArrayLike, FeatureNames, FeatureTypes, IterationRange, ModelIn
|
||||||
from .callback import TrainingCallback
|
from .callback import TrainingCallback
|
||||||
|
|
||||||
# Do not use class names on scikit-learn directly. Re-define the classes on
|
# Do not use class names on scikit-learn directly. Re-define the classes on
|
||||||
@ -1039,8 +1039,8 @@ class XGBModel(XGBModelBase):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def _get_iteration_range(
|
def _get_iteration_range(
|
||||||
self, iteration_range: Optional[Tuple[int, int]]
|
self, iteration_range: Optional[IterationRange]
|
||||||
) -> Tuple[int, int]:
|
) -> IterationRange:
|
||||||
if iteration_range is None or iteration_range[1] == 0:
|
if iteration_range is None or iteration_range[1] == 0:
|
||||||
# Use best_iteration if defined.
|
# Use best_iteration if defined.
|
||||||
try:
|
try:
|
||||||
@ -1057,7 +1057,7 @@ class XGBModel(XGBModelBase):
|
|||||||
output_margin: bool = False,
|
output_margin: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[ArrayLike] = None,
|
base_margin: Optional[ArrayLike] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> ArrayLike:
|
) -> ArrayLike:
|
||||||
"""Predict with `X`. If the model is trained with early stopping, then
|
"""Predict with `X`. If the model is trained with early stopping, then
|
||||||
:py:attr:`best_iteration` is used automatically. The estimator uses
|
:py:attr:`best_iteration` is used automatically. The estimator uses
|
||||||
@ -1129,7 +1129,7 @@ class XGBModel(XGBModelBase):
|
|||||||
def apply(
|
def apply(
|
||||||
self,
|
self,
|
||||||
X: ArrayLike,
|
X: ArrayLike,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""Return the predicted leaf every tree for each sample. If the model is trained
|
"""Return the predicted leaf every tree for each sample. If the model is trained
|
||||||
with early stopping, then :py:attr:`best_iteration` is used automatically.
|
with early stopping, then :py:attr:`best_iteration` is used automatically.
|
||||||
@ -1465,7 +1465,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
|||||||
output_margin: bool = False,
|
output_margin: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[ArrayLike] = None,
|
base_margin: Optional[ArrayLike] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> ArrayLike:
|
) -> ArrayLike:
|
||||||
with config_context(verbosity=self.verbosity):
|
with config_context(verbosity=self.verbosity):
|
||||||
class_probs = super().predict(
|
class_probs = super().predict(
|
||||||
@ -1500,7 +1500,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
|||||||
X: ArrayLike,
|
X: ArrayLike,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[ArrayLike] = None,
|
base_margin: Optional[ArrayLike] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> np.ndarray:
|
) -> np.ndarray:
|
||||||
"""Predict the probability of each `X` example being of a given class. If the
|
"""Predict the probability of each `X` example being of a given class. If the
|
||||||
model is trained with early stopping, then :py:attr:`best_iteration` is used
|
model is trained with early stopping, then :py:attr:`best_iteration` is used
|
||||||
@ -1942,7 +1942,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
|
|||||||
output_margin: bool = False,
|
output_margin: bool = False,
|
||||||
validate_features: bool = True,
|
validate_features: bool = True,
|
||||||
base_margin: Optional[ArrayLike] = None,
|
base_margin: Optional[ArrayLike] = None,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> ArrayLike:
|
) -> ArrayLike:
|
||||||
X, _ = _get_qid(X, None)
|
X, _ = _get_qid(X, None)
|
||||||
return super().predict(
|
return super().predict(
|
||||||
@ -1956,7 +1956,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
|
|||||||
def apply(
|
def apply(
|
||||||
self,
|
self,
|
||||||
X: ArrayLike,
|
X: ArrayLike,
|
||||||
iteration_range: Optional[Tuple[int, int]] = None,
|
iteration_range: Optional[IterationRange] = None,
|
||||||
) -> ArrayLike:
|
) -> ArrayLike:
|
||||||
X, _ = _get_qid(X, None)
|
X, _ = _get_qid(X, None)
|
||||||
return super().apply(X, iteration_range)
|
return super().apply(X, iteration_range)
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""XGBoost pyspark integration submodule for core code."""
|
"""XGBoost pyspark integration submodule for core code."""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
|
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Xgboost pyspark integration submodule for estimator API."""
|
"""Xgboost pyspark integration submodule for estimator API."""
|
||||||
|
|
||||||
# pylint: disable=too-many-ancestors
|
# pylint: disable=too-many-ancestors
|
||||||
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
|
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
|
||||||
# pylint: disable=unused-argument, too-many-locals
|
# pylint: disable=unused-argument, too-many-locals
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Xgboost pyspark integration submodule for params."""
|
"""Xgboost pyspark integration submodule for params."""
|
||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
# pylint: disable=too-few-public-methods
|
# pylint: disable=too-few-public-methods
|
||||||
@ -55,7 +56,6 @@ class HasFeaturesCols(Params):
|
|||||||
|
|
||||||
|
|
||||||
class HasEnableSparseDataOptim(Params):
|
class HasEnableSparseDataOptim(Params):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This is a Params based class that is extended by _SparkXGBParams
|
This is a Params based class that is extended by _SparkXGBParams
|
||||||
and holds the variable to store the boolean config of enabling sparse data optimization.
|
and holds the variable to store the boolean config of enabling sparse data optimization.
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Xgboost pyspark integration submodule for helper functions."""
|
"""Xgboost pyspark integration submodule for helper functions."""
|
||||||
|
|
||||||
# pylint: disable=fixme
|
# pylint: disable=fixme
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
change without notice.
|
change without notice.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=invalid-name,missing-function-docstring,import-error
|
# pylint: disable=invalid-name,missing-function-docstring,import-error
|
||||||
import gc
|
import gc
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests for training continuation."""
|
"""Tests for training continuation."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from typing import Any, Dict, TypeVar
|
from typing import Any, Dict, TypeVar
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests for dask shared by different test modules."""
|
"""Tests for dask shared by different test modules."""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from dask import array as da
|
from dask import array as da
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests related to the `DataIter` interface."""
|
"""Tests related to the `DataIter` interface."""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import xgboost
|
import xgboost
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests for evaluation metrics."""
|
"""Tests for evaluation metrics."""
|
||||||
|
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Testing code shared by other tests."""
|
"""Testing code shared by other tests."""
|
||||||
|
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
import collections
|
import collections
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests for updaters."""
|
"""Tests for updaters."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from functools import partial, update_wrapper
|
from functools import partial, update_wrapper
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2014-2023 by XGBoost Contributors
|
* Copyright 2014-2024 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include "xgboost/c_api.h"
|
#include "xgboost/c_api.h"
|
||||||
|
|
||||||
@ -994,8 +994,8 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs
|
|||||||
auto *learner = static_cast<Learner *>(handle);
|
auto *learner = static_cast<Learner *>(handle);
|
||||||
auto ctx = learner->Ctx()->MakeCPU();
|
auto ctx = learner->Ctx()->MakeCPU();
|
||||||
|
|
||||||
auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len);
|
auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, static_cast<size_t>(len)}, len);
|
||||||
auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len);
|
auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, static_cast<size_t>(len)}, len);
|
||||||
|
|
||||||
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
|
auto s_grad = linalg::ArrayInterfaceStr(t_grad);
|
||||||
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
|
auto s_hess = linalg::ArrayInterfaceStr(t_hess);
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2017-2023, XGBoost Contributors
|
* Copyright 2017-2024, XGBoost Contributors
|
||||||
* \file column_matrix.h
|
* \file column_matrix.h
|
||||||
* \brief Utility for fast column-wise access
|
* \brief Utility for fast column-wise access
|
||||||
* \author Philip Cho
|
* \author Philip Cho
|
||||||
@ -176,7 +176,7 @@ class ColumnMatrix {
|
|||||||
void SetValid(typename LBitField32::index_type i) {missing.Clear(i);}
|
void SetValid(typename LBitField32::index_type i) {missing.Clear(i);}
|
||||||
/** @brief assign the storage to the view. */
|
/** @brief assign the storage to the view. */
|
||||||
void InitView() {
|
void InitView() {
|
||||||
missing = LBitField32{Span{storage.data(), storage.size()}};
|
missing = LBitField32{Span{storage.data(), static_cast<size_t>(storage.size())}};
|
||||||
}
|
}
|
||||||
|
|
||||||
void GrowTo(std::size_t n_elements, bool init) {
|
void GrowTo(std::size_t n_elements, bool init) {
|
||||||
@ -318,8 +318,8 @@ class ColumnMatrix {
|
|||||||
common::Span<const BinIdxType> bin_index = {
|
common::Span<const BinIdxType> bin_index = {
|
||||||
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
|
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
|
||||||
column_size};
|
column_size};
|
||||||
return std::move(DenseColumnIter<BinIdxType, any_missing>{
|
return DenseColumnIter<BinIdxType, any_missing>{
|
||||||
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
|
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset};
|
||||||
}
|
}
|
||||||
|
|
||||||
// all columns are dense column and has no missing value
|
// all columns are dense column and has no missing value
|
||||||
@ -332,7 +332,7 @@ class ColumnMatrix {
|
|||||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||||
using ColumnBinT = decltype(t);
|
using ColumnBinT = decltype(t);
|
||||||
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
|
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
|
||||||
index_.size() / sizeof(ColumnBinT)};
|
static_cast<size_t>(index_.size() / sizeof(ColumnBinT))};
|
||||||
ParallelFor(n_samples, n_threads, [&](auto rid) {
|
ParallelFor(n_samples, n_threads, [&](auto rid) {
|
||||||
rid += base_rowid;
|
rid += base_rowid;
|
||||||
const size_t ibegin = rid * n_features;
|
const size_t ibegin = rid * n_features;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2017-2023 by XGBoost Contributors
|
* Copyright 2017-2024 by XGBoost Contributors
|
||||||
* \file hist_util.h
|
* \file hist_util.h
|
||||||
* \brief Utility for fast histogram aggregation
|
* \brief Utility for fast histogram aggregation
|
||||||
* \author Philip Cho, Tianqi Chen
|
* \author Philip Cho, Tianqi Chen
|
||||||
@ -113,8 +113,8 @@ class HistogramCuts {
|
|||||||
auto end = ptrs[column_id + 1];
|
auto end = ptrs[column_id + 1];
|
||||||
auto beg = ptrs[column_id];
|
auto beg = ptrs[column_id];
|
||||||
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
|
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
|
||||||
auto idx = it - values.cbegin();
|
auto idx = static_cast<bst_bin_t>(it - values.cbegin());
|
||||||
idx -= !!(idx == end);
|
idx -= !!(idx == static_cast<bst_bin_t>(end));
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,8 +136,8 @@ class HistogramCuts {
|
|||||||
auto beg = ptrs[fidx] + vals.cbegin();
|
auto beg = ptrs[fidx] + vals.cbegin();
|
||||||
// Truncates the value in case it's not perfectly rounded.
|
// Truncates the value in case it's not perfectly rounded.
|
||||||
auto v = static_cast<float>(common::AsCat(value));
|
auto v = static_cast<float>(common::AsCat(value));
|
||||||
auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin();
|
auto bin_idx = static_cast<bst_bin_t>(std::lower_bound(beg, end, v) - vals.cbegin());
|
||||||
if (bin_idx == ptrs.at(fidx + 1)) {
|
if (bin_idx == static_cast<bst_bin_t>(ptrs.at(fidx + 1))) {
|
||||||
bin_idx -= 1;
|
bin_idx -= 1;
|
||||||
}
|
}
|
||||||
return bin_idx;
|
return bin_idx;
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023, XGBoost Contributors
|
* Copyright 2023-2024, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||||
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||||
@ -76,7 +76,7 @@ class RefResourceView {
|
|||||||
|
|
||||||
[[nodiscard]] size_type size() const { return size_; } // NOLINT
|
[[nodiscard]] size_type size() const { return size_; } // NOLINT
|
||||||
[[nodiscard]] size_type size_bytes() const { // NOLINT
|
[[nodiscard]] size_type size_bytes() const { // NOLINT
|
||||||
return Span<const value_type>{data(), size()}.size_bytes();
|
return Span<const value_type>{data(), static_cast<size_t>(size())}.size_bytes();
|
||||||
}
|
}
|
||||||
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
|
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
|
||||||
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
|
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2022 by XGBoost Contributors
|
* Copyright 2022-2024, XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_COMMON_TRANSFORM_ITERATOR_H_
|
#ifndef XGBOOST_COMMON_TRANSFORM_ITERATOR_H_
|
||||||
#define XGBOOST_COMMON_TRANSFORM_ITERATOR_H_
|
#define XGBOOST_COMMON_TRANSFORM_ITERATOR_H_
|
||||||
|
|
||||||
#include <cstddef> // std::size_t
|
#include <cstddef> // std::size_t
|
||||||
#include <iterator> // std::random_access_iterator_tag
|
#include <iterator> // std::random_access_iterator_tag
|
||||||
#include <type_traits> // std::result_of_t, std::add_pointer_t, std::add_lvalue_reference_t
|
#include <type_traits> // for invoke_result_t, add_pointer_t, add_lvalue_reference_t
|
||||||
#include <utility> // std::forward
|
#include <utility> // std::forward
|
||||||
|
|
||||||
#include "xgboost/span.h" // ptrdiff_t
|
#include "xgboost/span.h" // ptrdiff_t
|
||||||
@ -26,7 +26,7 @@ class IndexTransformIter {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
using iterator_category = std::random_access_iterator_tag; // NOLINT
|
using iterator_category = std::random_access_iterator_tag; // NOLINT
|
||||||
using reference = std::result_of_t<Fn(std::size_t)>; // NOLINT
|
using reference = std::invoke_result_t<Fn, std::size_t>; // NOLINT
|
||||||
using value_type = std::remove_cv_t<std::remove_reference_t<reference>>; // NOLINT
|
using value_type = std::remove_cv_t<std::remove_reference_t<reference>>; // NOLINT
|
||||||
using difference_type = detail::ptrdiff_t; // NOLINT
|
using difference_type = detail::ptrdiff_t; // NOLINT
|
||||||
using pointer = std::add_pointer_t<value_type>; // NOLINT
|
using pointer = std::add_pointer_t<value_type>; // NOLINT
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2019-2023 by XGBoost Contributors
|
* Copyright 2019-2024, XGBoost Contributors
|
||||||
* \file array_interface.h
|
* \file array_interface.h
|
||||||
* \brief View of __array_interface__
|
* \brief View of __array_interface__
|
||||||
*/
|
*/
|
||||||
@ -12,7 +12,7 @@
|
|||||||
#include <limits> // for numeric_limits
|
#include <limits> // for numeric_limits
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <type_traits> // std::alignment_of,std::remove_pointer_t
|
#include <type_traits> // for alignment_of, remove_pointer_t, invoke_result_t
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -645,7 +645,7 @@ auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::result_of_t<Fn(std::int8_t)>();
|
return std::invoke_result_t<Fn, std::int8_t>();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <std::int32_t D, typename Fn>
|
template <std::int32_t D, typename Fn>
|
||||||
|
|||||||
@ -261,12 +261,10 @@ bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
|
|||||||
auto counting = thrust::make_counting_iterator(0llu);
|
auto counting = thrust::make_counting_iterator(0llu);
|
||||||
auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
|
auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
|
||||||
auto v = batch.GetElement(idx).value;
|
auto v = batch.GetElement(idx).value;
|
||||||
if (!is_valid(v)) {
|
if (is_valid(v) && isinf(v)) {
|
||||||
// discard the invalid elements.
|
return false;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
// check that there's no inf in data.
|
return true;
|
||||||
return !std::isinf(v);
|
|
||||||
});
|
});
|
||||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||||
// The default implementation in thrust optimizes any_of/none_of/all_of by using small
|
// The default implementation in thrust optimizes any_of/none_of/all_of by using small
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2019-2023 by XGBoost contributors
|
* Copyright 2019-2024, XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <thrust/iterator/discard_iterator.h>
|
#include <thrust/iterator/discard_iterator.h>
|
||||||
#include <thrust/iterator/transform_output_iterator.h>
|
#include <thrust/iterator/transform_output_iterator.h>
|
||||||
@ -13,7 +13,7 @@
|
|||||||
#include "../common/hist_util.cuh"
|
#include "../common/hist_util.cuh"
|
||||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||||
#include "./ellpack_page.cuh"
|
#include "./ellpack_page.cuh"
|
||||||
#include "device_adapter.cuh" // for HasInfInData
|
#include "device_adapter.cuh" // for NoInfInData
|
||||||
#include "ellpack_page.h"
|
#include "ellpack_page.h"
|
||||||
#include "gradient_index.h"
|
#include "gradient_index.h"
|
||||||
#include "xgboost/data.h"
|
#include "xgboost/data.h"
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2017-2023, XGBoost Contributors
|
* Copyright 2017-2024, XGBoost Contributors
|
||||||
* \brief Data type for fast histogram aggregation.
|
* \brief Data type for fast histogram aggregation.
|
||||||
*/
|
*/
|
||||||
#include "gradient_index.h"
|
#include "gradient_index.h"
|
||||||
@ -148,7 +148,8 @@ void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
|||||||
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
|
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
|
||||||
}
|
}
|
||||||
this->data = std::move(new_vec);
|
this->data = std::move(new_vec);
|
||||||
this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
|
this->index = common::Index{common::Span{data.data(), static_cast<size_t>(data.size())},
|
||||||
|
t_size};
|
||||||
};
|
};
|
||||||
|
|
||||||
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2021-2023 XGBoost contributors
|
* Copyright 2021-2024 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <cstddef> // for size_t
|
#include <cstddef> // for size_t
|
||||||
#include <cstdint> // for uint8_t
|
#include <cstdint> // for uint8_t
|
||||||
@ -40,7 +40,9 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// - index
|
// - index
|
||||||
page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
|
page->index =
|
||||||
|
common::Index{common::Span{page->data.data(), static_cast<size_t>(page->data.size())},
|
||||||
|
size_type};
|
||||||
|
|
||||||
// hit count
|
// hit count
|
||||||
if (!common::ReadVec(fi, &page->hit_count)) {
|
if (!common::ReadVec(fi, &page->hit_count)) {
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2020-2023, XGBoost contributors
|
* Copyright 2020-2024, XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
|
#ifndef XGBOOST_DATA_PROXY_DMATRIX_H_
|
||||||
#define XGBOOST_DATA_PROXY_DMATRIX_H_
|
#define XGBOOST_DATA_PROXY_DMATRIX_H_
|
||||||
@ -7,6 +7,7 @@
|
|||||||
#include <any> // for any, any_cast
|
#include <any> // for any, any_cast
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <type_traits> // for invoke_result_t
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
#include "adapter.h"
|
#include "adapter.h"
|
||||||
@ -171,10 +172,10 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
|
|||||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||||
}
|
}
|
||||||
if constexpr (get_value) {
|
if constexpr (get_value) {
|
||||||
return std::result_of_t<Fn(
|
return std::invoke_result_t<
|
||||||
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
Fn, decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value())>();
|
||||||
} else {
|
} else {
|
||||||
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
|
return std::invoke_result_t<Fn, decltype(std::declval<std::shared_ptr<ArrayAdapter>>())>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2019-2023 by XGBoost Contributors
|
* Copyright 2019-2024, XGBoost Contributors
|
||||||
* \file simple_dmatrix.cuh
|
* \file simple_dmatrix.cuh
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
|
#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_CUH_
|
||||||
@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
#include "../common/device_helpers.cuh"
|
#include "../common/device_helpers.cuh"
|
||||||
#include "../common/error_msg.h" // for InfInData
|
#include "../common/error_msg.h" // for InfInData
|
||||||
#include "device_adapter.cuh" // for HasInfInData
|
#include "device_adapter.cuh" // for NoInfInData
|
||||||
|
|
||||||
namespace xgboost::data {
|
namespace xgboost::data {
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2017-2023 by Contributors
|
* Copyright 2017-2024 by Contributors
|
||||||
*/
|
*/
|
||||||
#include "xgboost/predictor.h"
|
#include "xgboost/predictor.h"
|
||||||
|
|
||||||
@ -46,7 +46,7 @@ void ValidateBaseMarginShape(linalg::Tensor<float, D> const& margin, bst_row_t n
|
|||||||
void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_preds,
|
void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_preds,
|
||||||
const gbm::GBTreeModel& model) const {
|
const gbm::GBTreeModel& model) const {
|
||||||
CHECK_NE(model.learner_model_param->num_output_group, 0);
|
CHECK_NE(model.learner_model_param->num_output_group, 0);
|
||||||
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
|
auto n = static_cast<size_t>(model.learner_model_param->OutputLength() * info.num_row_);
|
||||||
|
|
||||||
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
|
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
|
||||||
if (ctx_->Device().IsCUDA()) {
|
if (ctx_->Device().IsCUDA()) {
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2023 by XGBoost Contributors
|
* Copyright 2023-2024 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
|
#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
|
||||||
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
|
#define XGBOOST_TREE_HIST_HIST_CACHE_H_
|
||||||
@ -48,11 +48,13 @@ class BoundedHistCollection {
|
|||||||
BoundedHistCollection() = default;
|
BoundedHistCollection() = default;
|
||||||
common::GHistRow operator[](std::size_t idx) {
|
common::GHistRow operator[](std::size_t idx) {
|
||||||
auto offset = node_map_.at(idx);
|
auto offset = node_map_.at(idx);
|
||||||
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
|
return common::Span{data_->data(), static_cast<size_t>(data_->size())}.subspan(
|
||||||
|
offset, n_total_bins_);
|
||||||
}
|
}
|
||||||
common::ConstGHistRow operator[](std::size_t idx) const {
|
common::ConstGHistRow operator[](std::size_t idx) const {
|
||||||
auto offset = node_map_.at(idx);
|
auto offset = node_map_.at(idx);
|
||||||
return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_);
|
return common::Span{data_->data(), static_cast<size_t>(data_->size())}.subspan(
|
||||||
|
offset, n_total_bins_);
|
||||||
}
|
}
|
||||||
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
|
void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
|
||||||
n_total_bins_ = n_total_bins;
|
n_total_bins_ = n_total_bins;
|
||||||
|
|||||||
@ -32,6 +32,8 @@ Foreach-Object {
|
|||||||
|
|
||||||
Write-Host "--- Run Python tests"
|
Write-Host "--- Run Python tests"
|
||||||
python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python
|
python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python
|
||||||
|
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||||
Write-Host "--- Run Python tests with GPU"
|
Write-Host "--- Run Python tests with GPU"
|
||||||
python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"`
|
python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"`
|
||||||
tests/python-gpu
|
tests/python-gpu
|
||||||
|
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
|
||||||
|
|||||||
8
tests/ci_build/Dockerfile.i386
Normal file
8
tests/ci_build/Dockerfile.i386
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
FROM i386/debian:sid
|
||||||
|
|
||||||
|
ENV DEBIAN_FRONTEND noninteractive
|
||||||
|
SHELL ["/bin/bash", "-c"] # Use Bash as shell
|
||||||
|
|
||||||
|
RUN \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y tar unzip wget git build-essential ninja-build cmake
|
||||||
@ -7,6 +7,7 @@ needed, run CMake .
|
|||||||
If this is a RC release, the version for JVM packages has the form
|
If this is a RC release, the version for JVM packages has the form
|
||||||
<major>.<minor>.<patch>-RC1
|
<major>.<minor>.<patch>-RC1
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Utilities for packaging R code and running tests."""
|
"""Utilities for packaging R code and running tests."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Utilities for the CI."""
|
"""Utilities for the CI."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
|||||||
@ -19,8 +19,38 @@ if (USE_HIP)
|
|||||||
endif (USE_HIP)
|
endif (USE_HIP)
|
||||||
|
|
||||||
file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
|
file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
|
||||||
if(NOT PLUGIN_SYCL)
|
|
||||||
list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
|
list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES})
|
||||||
|
|
||||||
|
if(PLUGIN_SYCL)
|
||||||
|
set(CMAKE_CXX_COMPILER "icpx")
|
||||||
|
file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc")
|
||||||
|
add_library(plugin_sycl_test OBJECT ${SYCL_TEST_SOURCES})
|
||||||
|
|
||||||
|
target_include_directories(plugin_sycl_test
|
||||||
|
PRIVATE
|
||||||
|
${gtest_SOURCE_DIR}/include
|
||||||
|
${xgboost_SOURCE_DIR}/include
|
||||||
|
${xgboost_SOURCE_DIR}/dmlc-core/include
|
||||||
|
${xgboost_SOURCE_DIR}/rabit/include)
|
||||||
|
|
||||||
|
target_compile_definitions(plugin_sycl_test PUBLIC -DXGBOOST_USE_SYCL=1)
|
||||||
|
|
||||||
|
target_link_libraries(plugin_sycl_test PUBLIC -fsycl)
|
||||||
|
|
||||||
|
set_target_properties(plugin_sycl_test PROPERTIES
|
||||||
|
COMPILE_FLAGS -fsycl
|
||||||
|
CXX_STANDARD 17
|
||||||
|
CXX_STANDARD_REQUIRED ON
|
||||||
|
POSITION_INDEPENDENT_CODE ON)
|
||||||
|
if(USE_OPENMP)
|
||||||
|
find_package(OpenMP REQUIRED)
|
||||||
|
set_target_properties(plugin_sycl_test PROPERTIES
|
||||||
|
COMPILE_FLAGS "-fsycl -qopenmp")
|
||||||
|
endif()
|
||||||
|
# Get compilation and link flags of plugin_sycl and propagate to testxgboost
|
||||||
|
target_link_libraries(testxgboost PUBLIC plugin_sycl_test)
|
||||||
|
# Add all objects of plugin_sycl to testxgboost
|
||||||
|
target_sources(testxgboost INTERFACE $<TARGET_OBJECTS:plugin_sycl_test>)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(PLUGIN_FEDERATED)
|
if(PLUGIN_FEDERATED)
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2019-2023 XGBoost contributors
|
* Copyright 2019-2024 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <xgboost/c_api.h>
|
#include <xgboost/c_api.h>
|
||||||
@ -212,8 +212,8 @@ TEST(CAPI, JsonModelIO) {
|
|||||||
bst_ulong saved_len{0};
|
bst_ulong saved_len{0};
|
||||||
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
|
XGBoosterSaveModelToBuffer(handle, R"({"format": "ubj"})", &saved_len, &saved);
|
||||||
ASSERT_EQ(len, saved_len);
|
ASSERT_EQ(len, saved_len);
|
||||||
auto l = StringView{data, len};
|
auto l = StringView{data, static_cast<size_t>(len)};
|
||||||
auto r = StringView{saved, saved_len};
|
auto r = StringView{saved, static_cast<size_t>(saved_len)};
|
||||||
ASSERT_EQ(l.size(), r.size());
|
ASSERT_EQ(l.size(), r.size());
|
||||||
ASSERT_EQ(l, r);
|
ASSERT_EQ(l, r);
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2016-2023 by XGBoost contributors
|
* Copyright 2016-2024 by XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include "helpers.h"
|
#include "helpers.h"
|
||||||
|
|
||||||
@ -216,7 +216,7 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
|
|||||||
static_assert(SimpleLCG::max() - SimpleLCG::min());
|
static_assert(SimpleLCG::max() - SimpleLCG::min());
|
||||||
|
|
||||||
void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const {
|
void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const {
|
||||||
RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
|
RandomDataGenerator{static_cast<bst_row_t>(p_fmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
|
||||||
p_fmat->Info().labels.Data());
|
p_fmat->Info().labels.Data());
|
||||||
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
|
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
|
||||||
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
|
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
|
||||||
@ -458,7 +458,7 @@ void RandomDataGenerator::GenerateCSR(
|
|||||||
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
EXPECT_EQ(row_count, dmat->Info().num_row_);
|
||||||
|
|
||||||
if (with_label) {
|
if (with_label) {
|
||||||
RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
|
RandomDataGenerator{static_cast<bst_row_t>(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense(
|
||||||
dmat->Info().labels.Data());
|
dmat->Info().labels.Data());
|
||||||
CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
|
CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
|
||||||
dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
|
dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
/**
|
/**
|
||||||
* Copyright 2016-2023 by XGBoost contributors
|
* Copyright 2016-2024 by XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
@ -238,7 +238,7 @@ class RandomDataGenerator {
|
|||||||
|
|
||||||
bst_bin_t bins_{0};
|
bst_bin_t bins_{0};
|
||||||
std::vector<FeatureType> ft_;
|
std::vector<FeatureType> ft_;
|
||||||
bst_cat_t max_cat_;
|
bst_cat_t max_cat_{32};
|
||||||
|
|
||||||
Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;
|
Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,11 @@
|
|||||||
* Copyright 2018-2023 XGBoost contributors
|
* Copyright 2018-2023 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
|
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||||
#include <xgboost/context.h>
|
#include <xgboost/context.h>
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
#include "../objective/test_multiclass_obj.h"
|
#include "../objective/test_multiclass_obj.h"
|
||||||
|
|
||||||
|
|||||||
91
tests/cpp/plugin/test_sycl_partition_builder.cc
Normal file
91
tests/cpp/plugin/test_sycl_partition_builder.cc
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2020-2024 by XGBoost contributors
|
||||||
|
*/
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "../../../plugin/sycl/common/partition_builder.h"
|
||||||
|
#include "../../../plugin/sycl/device_manager.h"
|
||||||
|
#include "../helpers.h"
|
||||||
|
|
||||||
|
namespace xgboost::sycl::common {
|
||||||
|
|
||||||
|
TEST(SyclPartitionBuilder, BasicTest) {
|
||||||
|
constexpr size_t kNodes = 5;
|
||||||
|
// Number of rows for each node
|
||||||
|
std::vector<size_t> rows = { 5, 5, 10, 1, 2 };
|
||||||
|
|
||||||
|
DeviceManager device_manager;
|
||||||
|
auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
|
||||||
|
PartitionBuilder builder;
|
||||||
|
builder.Init(&qu, kNodes, [&](size_t i) {
|
||||||
|
return rows[i];
|
||||||
|
});
|
||||||
|
|
||||||
|
// We test here only the basics, thus syntetic partition builder is adopted
|
||||||
|
// Number of rows to go left for each node.
|
||||||
|
std::vector<size_t> rows_for_left_node = { 2, 0, 7, 1, 2 };
|
||||||
|
|
||||||
|
size_t first_row_id = 0;
|
||||||
|
for(size_t nid = 0; nid < kNodes; ++nid) {
|
||||||
|
size_t n_rows_nodes = rows[nid];
|
||||||
|
|
||||||
|
auto rid_buff = builder.GetData(nid);
|
||||||
|
size_t rid_buff_size = rid_buff.size();
|
||||||
|
auto* rid_buff_ptr = rid_buff.data();
|
||||||
|
|
||||||
|
size_t n_left = rows_for_left_node[nid];
|
||||||
|
size_t n_right = rows[nid] - n_left;
|
||||||
|
|
||||||
|
qu.submit([&](::sycl::handler& cgh) {
|
||||||
|
cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) {
|
||||||
|
int row_id = first_row_id + pid[0];
|
||||||
|
rid_buff_ptr[pid[0]] = row_id;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
qu.wait();
|
||||||
|
first_row_id += n_left;
|
||||||
|
|
||||||
|
// We are storing indexes for the right side in the tail of the array to save some memory
|
||||||
|
qu.submit([&](::sycl::handler& cgh) {
|
||||||
|
cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) {
|
||||||
|
int row_id = first_row_id + pid[0];
|
||||||
|
rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
qu.wait();
|
||||||
|
first_row_id += n_right;
|
||||||
|
|
||||||
|
builder.SetNLeftElems(nid, n_left);
|
||||||
|
builder.SetNRightElems(nid, n_right);
|
||||||
|
}
|
||||||
|
|
||||||
|
::sycl::event event;
|
||||||
|
std::vector<size_t> v(*std::max_element(rows.begin(), rows.end()));
|
||||||
|
size_t row_id = 0;
|
||||||
|
for(size_t nid = 0; nid < kNodes; ++nid) {
|
||||||
|
builder.MergeToArray(nid, v.data(), event);
|
||||||
|
qu.wait();
|
||||||
|
|
||||||
|
// Check that row_id for left side are correct
|
||||||
|
for(size_t j = 0; j < rows_for_left_node[nid]; ++j) {
|
||||||
|
ASSERT_EQ(v[j], row_id++);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that row_id for right side are correct
|
||||||
|
for(size_t j = 0; j < rows[nid] - rows_for_left_node[nid]; ++j) {
|
||||||
|
ASSERT_EQ(v[rows[nid] - j - 1], row_id++);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that number of left/right rows are correct
|
||||||
|
size_t n_left = builder.GetNLeftElems(nid);
|
||||||
|
size_t n_right = builder.GetNRightElems(nid);
|
||||||
|
ASSERT_EQ(n_left, rows_for_left_node[nid]);
|
||||||
|
ASSERT_EQ(n_right, (rows[nid] - rows_for_left_node[nid]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace xgboost::common
|
||||||
@ -2,11 +2,19 @@
|
|||||||
* Copyright 2017-2023 XGBoost contributors
|
* Copyright 2017-2023 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
|
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||||
#include <xgboost/predictor.h>
|
#include <xgboost/predictor.h>
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
#include "../../../src/data/adapter.h"
|
#include "../../../src/data/adapter.h"
|
||||||
#include "../../../src/data/proxy_dmatrix.h"
|
|
||||||
#include "../../../src/gbm/gbtree.h"
|
#include "../../../src/gbm/gbtree.h"
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
|
|
||||||
|
#include "../../../src/data/proxy_dmatrix.h"
|
||||||
#include "../../../src/gbm/gbtree_model.h"
|
#include "../../../src/gbm/gbtree_model.h"
|
||||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
|
|||||||
@ -2,7 +2,11 @@
|
|||||||
* Copyright 2017-2019 XGBoost contributors
|
* Copyright 2017-2019 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#pragma GCC diagnostic push
|
||||||
|
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||||
|
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||||
#include <xgboost/objective.h>
|
#include <xgboost/objective.h>
|
||||||
|
#pragma GCC diagnostic pop
|
||||||
#include <xgboost/context.h>
|
#include <xgboost/context.h>
|
||||||
|
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
"""Loading a pickled model generated by test_pickling.py, only used by
|
"""Loading a pickled model generated by test_pickling.py, only used by
|
||||||
`test_gpu_with_dask.py`"""
|
`test_gpu_with_dask.py`"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Test model IO with pickle."""
|
"""Test model IO with pickle."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|||||||
@ -152,6 +152,7 @@ class TestGPUPredict:
|
|||||||
|
|
||||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||||
@pytest.mark.skipif(**tm.no_cupy())
|
@pytest.mark.skipif(**tm.no_cupy())
|
||||||
|
@pytest.mark.skipif(**tm.no_cudf())
|
||||||
def test_inplace_predict_device_type(self, device: str) -> None:
|
def test_inplace_predict_device_type(self, device: str) -> None:
|
||||||
"""Test inplace predict with different device and data types.
|
"""Test inplace predict with different device and data types.
|
||||||
|
|
||||||
|
|||||||
@ -249,7 +249,7 @@ def test_custom_objective(
|
|||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(**tm.no_pandas())
|
@pytest.mark.skipif(**tm.no_cudf())
|
||||||
def test_ranking_qid_df():
|
def test_ranking_qid_df():
|
||||||
import cudf
|
import cudf
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import pytest
|
|||||||
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from xgboost import testing as tm
|
from xgboost import testing as tm
|
||||||
|
from xgboost.core import Integer
|
||||||
from xgboost.testing.updater import ResetStrategy
|
from xgboost.testing.updater import ResetStrategy
|
||||||
|
|
||||||
dpath = tm.data_dir(__file__)
|
dpath = tm.data_dir(__file__)
|
||||||
@ -97,15 +98,15 @@ class TestModels:
|
|||||||
def test_boost_from_prediction(self):
|
def test_boost_from_prediction(self):
|
||||||
# Re-construct dtrain here to avoid modification
|
# Re-construct dtrain here to avoid modification
|
||||||
margined, _ = tm.load_agaricus(__file__)
|
margined, _ = tm.load_agaricus(__file__)
|
||||||
bst = xgb.train({'tree_method': 'hist'}, margined, 1)
|
bst = xgb.train({"tree_method": "hist"}, margined, 1)
|
||||||
predt_0 = bst.predict(margined, output_margin=True)
|
predt_0 = bst.predict(margined, output_margin=True)
|
||||||
margined.set_base_margin(predt_0)
|
margined.set_base_margin(predt_0)
|
||||||
bst = xgb.train({'tree_method': 'hist'}, margined, 1)
|
bst = xgb.train({"tree_method": "hist"}, margined, 1)
|
||||||
predt_1 = bst.predict(margined)
|
predt_1 = bst.predict(margined)
|
||||||
|
|
||||||
assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
|
assert np.any(np.abs(predt_1 - predt_0) > 1e-6)
|
||||||
dtrain, _ = tm.load_agaricus(__file__)
|
dtrain, _ = tm.load_agaricus(__file__)
|
||||||
bst = xgb.train({'tree_method': 'hist'}, dtrain, 2)
|
bst = xgb.train({"tree_method": "hist"}, dtrain, 2)
|
||||||
predt_2 = bst.predict(dtrain)
|
predt_2 = bst.predict(dtrain)
|
||||||
assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
|
assert np.all(np.abs(predt_2 - predt_1) < 1e-6)
|
||||||
|
|
||||||
@ -331,10 +332,15 @@ class TestModels:
|
|||||||
dtrain: xgb.DMatrix,
|
dtrain: xgb.DMatrix,
|
||||||
num_parallel_tree: int,
|
num_parallel_tree: int,
|
||||||
num_classes: int,
|
num_classes: int,
|
||||||
num_boost_round: int
|
num_boost_round: int,
|
||||||
|
use_np_type: bool,
|
||||||
):
|
):
|
||||||
beg = 3
|
beg = 3
|
||||||
|
if use_np_type:
|
||||||
|
end: Integer = np.int32(7)
|
||||||
|
else:
|
||||||
end = 7
|
end = 7
|
||||||
|
|
||||||
sliced: xgb.Booster = booster[beg:end]
|
sliced: xgb.Booster = booster[beg:end]
|
||||||
assert sliced.feature_types == booster.feature_types
|
assert sliced.feature_types == booster.feature_types
|
||||||
|
|
||||||
@ -345,7 +351,7 @@ class TestModels:
|
|||||||
sliced = booster[beg:end:2]
|
sliced = booster[beg:end:2]
|
||||||
assert sliced_trees == len(sliced.get_dump())
|
assert sliced_trees == len(sliced.get_dump())
|
||||||
|
|
||||||
sliced = booster[beg: ...]
|
sliced = booster[beg:]
|
||||||
sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
|
sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
|
||||||
assert sliced_trees == len(sliced.get_dump())
|
assert sliced_trees == len(sliced.get_dump())
|
||||||
|
|
||||||
@ -357,7 +363,7 @@ class TestModels:
|
|||||||
sliced_trees = end * num_parallel_tree * num_classes
|
sliced_trees = end * num_parallel_tree * num_classes
|
||||||
assert sliced_trees == len(sliced.get_dump())
|
assert sliced_trees == len(sliced.get_dump())
|
||||||
|
|
||||||
sliced = booster[...: end]
|
sliced = booster[: end]
|
||||||
sliced_trees = end * num_parallel_tree * num_classes
|
sliced_trees = end * num_parallel_tree * num_classes
|
||||||
assert sliced_trees == len(sliced.get_dump())
|
assert sliced_trees == len(sliced.get_dump())
|
||||||
|
|
||||||
@ -383,14 +389,14 @@ class TestModels:
|
|||||||
assert len(trees) == num_boost_round
|
assert len(trees) == num_boost_round
|
||||||
|
|
||||||
with pytest.raises(TypeError):
|
with pytest.raises(TypeError):
|
||||||
booster["wrong type"]
|
booster["wrong type"] # type: ignore
|
||||||
with pytest.raises(IndexError):
|
with pytest.raises(IndexError):
|
||||||
booster[: num_boost_round + 1]
|
booster[: num_boost_round + 1]
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
booster[1, 2] # too many dims
|
booster[1, 2] # too many dims
|
||||||
# setitem is not implemented as model is immutable during slicing.
|
# setitem is not implemented as model is immutable during slicing.
|
||||||
with pytest.raises(TypeError):
|
with pytest.raises(TypeError):
|
||||||
booster[...: end] = booster
|
booster[:end] = booster # type: ignore
|
||||||
|
|
||||||
sliced_0 = booster[1:3]
|
sliced_0 = booster[1:3]
|
||||||
np.testing.assert_allclose(
|
np.testing.assert_allclose(
|
||||||
@ -446,15 +452,21 @@ class TestModels:
|
|||||||
|
|
||||||
assert len(booster.get_dump()) == total_trees
|
assert len(booster.get_dump()) == total_trees
|
||||||
|
|
||||||
self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
|
self.run_slice(
|
||||||
|
booster, dtrain, num_parallel_tree, num_classes, num_boost_round, False
|
||||||
|
)
|
||||||
|
|
||||||
bytesarray = booster.save_raw(raw_format="ubj")
|
bytesarray = booster.save_raw(raw_format="ubj")
|
||||||
booster = xgb.Booster(model_file=bytesarray)
|
booster = xgb.Booster(model_file=bytesarray)
|
||||||
self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
|
self.run_slice(
|
||||||
|
booster, dtrain, num_parallel_tree, num_classes, num_boost_round, False
|
||||||
|
)
|
||||||
|
|
||||||
bytesarray = booster.save_raw(raw_format="deprecated")
|
bytesarray = booster.save_raw(raw_format="deprecated")
|
||||||
booster = xgb.Booster(model_file=bytesarray)
|
booster = xgb.Booster(model_file=bytesarray)
|
||||||
self.run_slice(booster, dtrain, num_parallel_tree, num_classes, num_boost_round)
|
self.run_slice(
|
||||||
|
booster, dtrain, num_parallel_tree, num_classes, num_boost_round, True
|
||||||
|
)
|
||||||
|
|
||||||
def test_slice_multi(self) -> None:
|
def test_slice_multi(self) -> None:
|
||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
@ -479,7 +491,7 @@ class TestModels:
|
|||||||
},
|
},
|
||||||
num_boost_round=num_boost_round,
|
num_boost_round=num_boost_round,
|
||||||
dtrain=Xy,
|
dtrain=Xy,
|
||||||
callbacks=[ResetStrategy()]
|
callbacks=[ResetStrategy()],
|
||||||
)
|
)
|
||||||
sliced = [t for t in booster]
|
sliced = [t for t in booster]
|
||||||
assert len(sliced) == 16
|
assert len(sliced) == 16
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Tests for running inplace prediction."""
|
"""Tests for running inplace prediction."""
|
||||||
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -61,7 +62,7 @@ def run_predict_leaf(device: str) -> np.ndarray:
|
|||||||
|
|
||||||
validate_leaf_output(leaf, num_parallel_tree)
|
validate_leaf_output(leaf, num_parallel_tree)
|
||||||
|
|
||||||
n_iters = 2
|
n_iters = np.int32(2)
|
||||||
sliced = booster.predict(
|
sliced = booster.predict(
|
||||||
m,
|
m,
|
||||||
pred_leaf=True,
|
pred_leaf=True,
|
||||||
|
|||||||
@ -440,7 +440,7 @@ def test_regression():
|
|||||||
preds = xgb_model.predict(X[test_index])
|
preds = xgb_model.predict(X[test_index])
|
||||||
# test other params in XGBRegressor().fit
|
# test other params in XGBRegressor().fit
|
||||||
preds2 = xgb_model.predict(
|
preds2 = xgb_model.predict(
|
||||||
X[test_index], output_margin=True, iteration_range=(0, 3)
|
X[test_index], output_margin=True, iteration_range=(0, np.int16(3))
|
||||||
)
|
)
|
||||||
preds3 = xgb_model.predict(
|
preds3 = xgb_model.predict(
|
||||||
X[test_index], output_margin=True, iteration_range=None
|
X[test_index], output_margin=True, iteration_range=None
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Copyright 2019-2023, XGBoost contributors"""
|
"""Copyright 2019-2023, XGBoost contributors"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
"""Copyright 2019-2022 XGBoost contributors"""
|
"""Copyright 2019-2022 XGBoost contributors"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user