130 lines
5.1 KiB
R
130 lines
5.1 KiB
R
% Generated by roxygen2: do not edit by hand
|
|
% Please edit documentation in R/xgb.Booster.R
|
|
\name{predict.xgb.Booster}
|
|
\alias{predict.xgb.Booster}
|
|
\alias{predict.xgb.Booster.handle}
|
|
\title{Predict method for eXtreme Gradient Boosting model}
|
|
\usage{
|
|
\method{predict}{xgb.Booster}(object, newdata, missing = NA,
|
|
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
|
|
reshape = FALSE, ...)
|
|
|
|
\method{predict}{xgb.Booster.handle}(object, ...)
|
|
}
|
|
\arguments{
|
|
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}
|
|
|
|
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.}
|
|
|
|
\item{missing}{Missing is only used when input is a dense matrix. Pick a float value that represents
|
|
missing values in data (e.g., sometimes 0 or some other extreme value is used).}
|
|
|
|
\item{outputmargin}{whether the prediction should be returned in the form of original untransformed
|
|
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
|
|
logistic regression would result in predictions for log-odds instead of probabilities.}
|
|
|
|
\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
|
|
It will use all the trees by default (\code{NULL} value).}
|
|
|
|
\item{predleaf}{whether to predict leaf indices instead.}
|
|
|
|
\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
|
|
prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
|
|
|
|
\item{...}{Parameters passed to \code{predict.xgb.Booster}}
|
|
}
|
|
\value{
|
|
For regression or binary classification, it returns a vector of length \code{nrow(newdata)}.
|
|
For multiclass classification, either a \code{num_class * nrow(newdata)} vector or
|
|
a \code{(nrow(newdata), num_class)} dimension matrix is returned, depending on
|
|
the \code{reshape} value.
|
|
|
|
When \code{predleaf = TRUE}, the output is a matrix object with the
|
|
number of columns corresponding to the number of trees.
|
|
}
|
|
\description{
|
|
Predicted values based on either xgboost model or model handle object.
|
|
}
|
|
\details{
|
|
Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
|
|
and it is not necessarily equal to the number of trees in a model.
|
|
E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
|
|
For multiclass classification, however, there are multiple trees per iteration,
|
|
yet \code{ntreelimit} limits the number of boosting iterations.
|
|
|
|
Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
|
|
since gblinear doesn't keep its boosting history.
|
|
|
|
One possible practical application of the \code{predleaf} option is to use the model
|
|
as a generator of new features which capture non-linearity and interactions,
|
|
e.g., as implemented in \code{\link{xgb.create.features}}.
|
|
}
|
|
\examples{
|
|
## binary classification:
|
|
|
|
data(agaricus.train, package='xgboost')
|
|
data(agaricus.test, package='xgboost')
|
|
train <- agaricus.train
|
|
test <- agaricus.test
|
|
|
|
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
|
|
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
|
# use all trees by default
|
|
pred <- predict(bst, test$data)
|
|
# use only the 1st tree
|
|
pred <- predict(bst, test$data, ntreelimit = 1)
|
|
|
|
|
|
## multiclass classification in iris dataset:
|
|
|
|
lb <- as.numeric(iris$Species) - 1
|
|
num_class <- 3
|
|
set.seed(11)
|
|
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
|
|
max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
|
|
objective = "multi:softprob", num_class = num_class)
|
|
# predict for softprob returns num_class probability numbers per case:
|
|
pred <- predict(bst, as.matrix(iris[, -5]))
|
|
str(pred)
|
|
# reshape it to a num_class-columns matrix
|
|
pred <- matrix(pred, ncol=num_class, byrow=TRUE)
|
|
# convert the probabilities to softmax labels
|
|
pred_labels <- max.col(pred) - 1
|
|
# the following should result in the same error as seen in the last iteration
|
|
sum(pred_labels != lb)/length(lb)
|
|
|
|
# compare that to the predictions from softmax:
|
|
set.seed(11)
|
|
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
|
|
max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
|
|
objective = "multi:softmax", num_class = num_class)
|
|
pred <- predict(bst, as.matrix(iris[, -5]))
|
|
str(pred)
|
|
all.equal(pred, pred_labels)
|
|
# prediction from using only 5 iterations should result
|
|
# in the same error as seen in iteration 5:
|
|
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
|
|
sum(pred5 != lb)/length(lb)
|
|
|
|
|
|
## random forest-like model of 25 trees for binary classification:
|
|
|
|
set.seed(11)
|
|
bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
|
|
nthread = 2, nrounds = 1, objective = "binary:logistic",
|
|
num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
|
|
# Inspect the prediction error vs number of trees:
|
|
lb <- test$label
|
|
dtest <- xgb.DMatrix(test$data, label=lb)
|
|
err <- sapply(1:25, function(n) {
|
|
pred <- predict(bst, dtest, ntreelimit=n)
|
|
sum((pred > 0.5) != lb)/length(lb)
|
|
})
|
|
plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
|
|
|
|
}
|
|
\seealso{
|
|
\code{\link{xgb.train}}.
|
|
}
|
|
|