% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{predict.xgb.Booster}
\alias{predict.xgb.Booster}
\title{Predict method for XGBoost model}
\usage{
\method{predict}{xgb.Booster}(
  object,
  newdata,
  missing = NA,
  outputmargin = FALSE,
  predleaf = FALSE,
  predcontrib = FALSE,
  approxcontrib = FALSE,
  predinteraction = FALSE,
  training = FALSE,
  iterationrange = NULL,
  strict_shape = FALSE,
  avoid_transpose = FALSE,
  validate_features = FALSE,
  base_margin = NULL,
  ...
)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster}.}

\item{newdata}{Takes a \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector},
local data file, or \code{xgb.DMatrix}.

For single-row predictions on sparse data, it is recommended to use the CSR format. If passing
a sparse vector, it will be treated as a row vector.

Note that, for repeated predictions on the same data, one might want to create an \code{xgb.DMatrix}
to pass here instead of passing R types like matrices or data frames, as predictions will be
faster on a DMatrix (see the sketch in the examples).

If \code{newdata} is a \code{data.frame}, be aware that:
\itemize{
\item Columns will be converted to numeric if they aren't already, which could potentially make
the operation slower than on an equivalent \code{matrix} object.
\item The order of the columns must match that of the data from which the model was fitted
(i.e. columns will not be referenced by their names, just by their order in the data).
\item If the model was fitted to data with categorical columns, these columns must be of
\code{factor} type here, and must use the same encoding (i.e. have the same levels).
\item If \code{newdata} contains any \code{factor} columns, they will be converted to base-0
encoding (same as during DMatrix creation) - hence, one should not pass a \code{factor}
for a column that had a different type during training.
}}

\item{missing}{Float value that represents missing values in data
(e.g., 0 or some other extreme value).

This parameter is not used when \code{newdata} is an \code{xgb.DMatrix} - in such cases,
the value should instead be passed as an argument to the DMatrix constructor.}

\item{outputmargin}{Whether the prediction should be returned in the form of the
original untransformed sum of predictions from the boosting iterations' results.
E.g., setting \code{outputmargin = TRUE} for logistic regression would return log-odds
instead of probabilities.}

\item{predleaf}{Whether to predict per-tree leaf indices.}

\item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).}

\item{approxcontrib}{Whether to use a fast approximation for feature contributions (see Details).}

\item{predinteraction}{Whether to return contributions of feature interactions to individual predictions (see Details).}

\item{training}{Whether the prediction result is used for training. For the "dart" booster,
predicting in training mode will perform dropout.}

\item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
a vector of two numbers with the start and end of the sequence (same format as R's \code{seq} - i.e.
base-1 indexing, and inclusive of both ends).

For example, passing \code{c(1, 20)} will predict using the first twenty iterations, while passing \code{c(1, 1)} will
predict using only the first one.

If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
of the iterations (rounds) otherwise.

If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.}

\item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode
regardless of the model type - meaning that, for example, both a multi-class and a binary classification
model would generate output arrays with the same number of dimensions, with the 'class' dimension having
size equal to '1' for the binary model.

If passing \code{FALSE} (the default), dimensions will be simplified according to the model type, so that a
binary classification model, for example, would not have a redundant dimension for 'class'.

See the documentation for the return type for the exact shape of the output arrays for each prediction mode.}

\item{avoid_transpose}{Whether to output the resulting predictions in the same memory layout in which they
are generated by the core XGBoost library, without transposing them to match the expected output shape.

Internally, XGBoost uses row-major order for the predictions it generates, while R arrays use column-major
order, hence the result needs to be transposed in order to have the expected shape when represented as
an R array or matrix, which might be a slow operation.

If passing \code{TRUE}, then the result will have its dimensions in reverse order - for example, rows
will be the last dimension instead of the first.}

\item{validate_features}{When \code{TRUE}, validate that the booster's and \code{newdata}'s
feature names match (only applicable when both \code{object} and \code{newdata} have feature names).

If the column names differ and \code{newdata} is not an \code{xgb.DMatrix}, will try to reorder
the columns in \code{newdata} to match the booster's.

If the booster has feature types and \code{newdata} is either an \code{xgb.DMatrix} or
\code{data.frame}, will additionally verify that categorical columns are of the
correct type in \code{newdata}, throwing an error if they do not match.

If passing \code{FALSE}, it is assumed that the feature names and types are the same,
and come in the same order as in the training data.

Note that this check might add some sizable latency to the predictions, so it's
recommended to disable it for performance-sensitive applications.}

\item{base_margin}{Base margin used for boosting from an existing model.

Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will
be ignored, as the base margin needs to be added to the DMatrix instead (e.g. by passing it as
an argument to its constructor, or by calling \code{\link[=setinfo.xgb.DMatrix]{setinfo.xgb.DMatrix()}}).}

\item{...}{Not used.}
}
\value{
A numeric vector or array, with corresponding dimensions depending on the prediction mode and on
parameter \code{strict_shape}, as follows:

If passing \code{strict_shape=FALSE}:\itemize{
\item For regression or binary classification: a vector of length \code{nrows}.
\item For multi-class and multi-target objectives: a matrix of dimensions \verb{[nrows, ngroups]}.

Note that the objective variant \code{multi:softmax} defaults to predicting the most likely class
(a vector of length \code{nrows}) instead of per-class probabilities.
\item For \code{predleaf}: a matrix with one column per tree.

For multi-class / multi-target, the columns will be arranged so that the output has
the leaves from one group followed by the leaves of the next group (e.g. the order will be \code{group1:tree1},
\code{group1:tree2}, ..., \code{group2:tree1}, \code{group2:tree2}, ...).
\item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions
\verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value.

For multi-class and multi-target objectives, will be an array with dimensions \verb{[nrows, ngroups, nfeats+1]}.

The contribution values are on the scale of the untransformed margin (e.g., for binary classification,
the values are log-odds deviations from the baseline).
\item For \code{predinteraction}: when not multi-class / multi-target, the output is a 3D array of
dimensions \verb{[nrows, nfeats+1, nfeats+1]}. The off-diagonal (in the last two dimensions)
elements represent different feature interaction contributions. The array is symmetric w.r.t. the last
two dimensions. The "+ 1" column corresponds to the baseline. Summing this array along the last
dimension should produce practically the same result as \code{predcontrib = TRUE}.

For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
}

If passing \code{strict_shape=TRUE}, the result is always an array:\itemize{
\item For normal predictions, the dimension is \verb{[nrows, ngroups]}.
\item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}.
\item For \code{predinteraction=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
\item For \code{predleaf=TRUE}, the dimension is \verb{[nrows, niter, ngroups, num_parallel_tree]}.
}

If passing \code{avoid_transpose=TRUE}, then the dimensions in all cases will be in reverse order - for
example, for \code{predinteraction}, they will be \verb{[nfeats+1, nfeats+1, ngroups, nrows]}
instead of \verb{[nrows, ngroups, nfeats+1, nfeats+1]}.
}
\description{
Predict values on data based on an XGBoost model.
}
\details{
Note that \code{iterationrange} currently does nothing for predictions from the "gblinear"
booster, since "gblinear" doesn't keep its boosting history.

One possible practical application of the \code{predleaf} option is to use the model
as a generator of new features that capture non-linearities and interactions,
e.g., as implemented in \code{\link[=xgb.create.features]{xgb.create.features()}}.
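
As a minimal sketch of this idea (assuming a fitted booster \code{bst} and a numeric
feature matrix \code{x}; \code{Matrix::sparse.model.matrix()} is just one possible
encoding choice here):

\preformatted{leaf_idx <- predict(bst, x, predleaf = TRUE)
# one-hot encode the per-tree leaf indices into a sparse design matrix
leaf_df <- as.data.frame(lapply(as.data.frame(leaf_idx), factor))
new_feats <- Matrix::sparse.model.matrix(~ . - 1, data = leaf_df)
}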

Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
individual predictions. For the "gblinear" booster, feature contributions are simply linear terms
(feature_beta * feature_value). For the "gbtree" booster, feature contributions are SHAP
values (Lundberg 2017) that sum to the difference between the expected output
of the model and the current prediction (where the Hessian weights are used to compute the expectations).
Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
in \url{http://blog.datadive.net/interpreting-random-forests/}.
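
As a quick check of this additivity property (a sketch, again assuming \code{bst} and \code{x}):

\preformatted{shap <- predict(bst, x, predcontrib = TRUE)
margin <- predict(bst, x, outputmargin = TRUE)
# per-row SHAP sums should match the margin predictions up to float precision
all.equal(unname(rowSums(shap)), unname(margin), tolerance = 1e-5)
}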

With \code{predinteraction = TRUE}, SHAP values of contributions of the interactions of each pair of features
are computed. Note that this operation might be rather expensive in terms of compute and memory.
Since its cost grows quadratically with the number of features, it is recommended to first select
the most important features. See below for the format of the returned results.
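
Summing the interaction array over one of its feature axes should recover the per-feature
contributions (a sketch under the same assumptions as above):

\preformatted{inter <- predict(bst, x, predinteraction = TRUE) # [nrows, nfeats+1, nfeats+1]
contr <- predict(bst, x, predcontrib = TRUE)     # [nrows, nfeats+1]
all.equal(apply(inter, c(1, 2), sum), contr,
          check.attributes = FALSE, tolerance = 1e-5)
}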

The \code{predict()} method uses as many threads as defined in the \code{xgb.Booster} object (all available threads by default).
If you want to change their number, assign a new number to \code{nthread} using \code{\link[=xgb.parameters<-]{xgb.parameters<-()}}.
Note that converting a matrix to \code{\link[=xgb.DMatrix]{xgb.DMatrix()}} uses multiple threads too.
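
For example (a sketch; this modifies the booster in place):

\preformatted{xgb.parameters(bst) <- list(nthread = 1)
}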
}
\examples{
## binary classification:

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

## Keep the number of threads at 2 for examples
nthread <- 2
data.table::setDTthreads(nthread)

train <- agaricus.train
test <- agaricus.test

bst <- xgb.train(
  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 0.5,
  nthread = nthread,
  nrounds = 5,
  objective = "binary:logistic"
)

# use all trees by default
pred <- predict(bst, test$data)
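
# for repeated predictions on the same data, pre-building an xgb.DMatrix
# avoids reconstructing it on every call (a sketch):
dtest <- xgb.DMatrix(test$data)
pred_dm <- predict(bst, dtest)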

# use only the 1st tree
pred1 <- predict(bst, test$data, iterationrange = c(1, 1))
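
# strict_shape = TRUE keeps an explicit 'class' dimension even for a
# binary model (a sketch; see the 'Value' section for exact shapes):
pred_arr <- predict(bst, test$data, strict_shape = TRUE)
str(pred_arr)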

# Predicting tree leaf indices:
# the result is an nsamples X ntrees matrix
pred_leaf <- predict(bst, test$data, predleaf = TRUE)
str(pred_leaf)

# Predicting feature contributions to predictions:
# the result is an nsamples X (nfeatures + 1) matrix
pred_contr <- predict(bst, test$data, predcontrib = TRUE)
str(pred_contr)
# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
summary(rowSums(pred_contr) - qlogis(pred))
# for the 1st record, let's inspect the features that had a non-zero contribution to its prediction:
contr1 <- pred_contr[1, ]
contr1 <- contr1[-length(contr1)] # drop the baseline term
contr1 <- contr1[contr1 != 0] # drop non-contributing features
contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
old_mar <- par("mar")
par(mar = old_mar + c(0, 7, 0, 0))
barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
par(mar = old_mar)
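
# Predicting contributions of feature interactions (a sketch; this can be
# expensive, so only a few rows are used here):
# the result is an nsamples X (nfeatures + 1) X (nfeatures + 1) array
pred_inter <- predict(bst, test$data[1:10, ], predinteraction = TRUE)
str(pred_inter)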

## multi-class classification in the iris dataset:

lb <- as.numeric(iris$Species) - 1
num_class <- 3

set.seed(11)

bst <- xgb.train(
  data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
  max_depth = 4,
  eta = 0.5,
  nthread = 2,
  nrounds = 10,
  subsample = 0.5,
  objective = "multi:softprob",
  num_class = num_class
)

# predict for softprob returns num_class probability numbers per case:
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
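
# avoid_transpose = TRUE keeps the row-major layout produced by the core
# library, so the dimensions come out reversed (a sketch):
pred_t <- predict(bst, as.matrix(iris[, -5]), avoid_transpose = TRUE)
str(pred_t)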

# convert the probabilities to class labels
pred_labels <- max.col(pred) - 1
# the following should result in the same error as seen in the last iteration
sum(pred_labels != lb) / length(lb)
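
# validate_features = TRUE checks that newdata's column names match the
# booster's feature names before predicting (a sketch):
pred_v <- predict(bst, as.matrix(iris[, -5]), validate_features = TRUE)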

# compare with predictions from softmax:
set.seed(11)

bst <- xgb.train(
  data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
  max_depth = 4,
  eta = 0.5,
  nthread = 2,
  nrounds = 10,
  subsample = 0.5,
  objective = "multi:softmax",
  num_class = num_class
)

pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction using only the first 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange = c(1, 5))
sum(pred5 != lb) / length(lb)
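
# passing "all" uses every round, regardless of early stopping:
pred_all <- predict(bst, as.matrix(iris[, -5]), iterationrange = "all")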
}
\references{
\enumerate{
\item Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
\item Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles",
\url{https://arxiv.org/abs/1706.06060}
}
}
\seealso{
\code{\link[=xgb.train]{xgb.train()}}
}