xgboost/R-package/man/predict.xgb.Booster.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.Booster.R
\name{predict.xgb.Booster}
\alias{predict.xgb.Booster}
\alias{predict.xgb.Booster.handle}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\method{predict}{xgb.Booster}(object, newdata, missing = NA,
  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
  reshape = FALSE, ...)

\method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}

\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.}

\item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
missing values in data (e.g., sometimes 0 or some other extreme value is used).}

\item{outputmargin}{whether the prediction should be returned in the for of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}

\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
It will use all the trees by default (\code{NULL} value).}

\item{predleaf}{whether predict leaf index instead.}

\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}

\item{...}{Parameters passed to \code{predict.xgb.Booster}}
}
\value{
For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
the \code{reshape} value.

When \code{predleaf = TRUE}, the output is a matrix object with the
number of columns corresponding to the number of trees.
}
\description{
Predicted values based on either xgboost model or model handle object.
}
\details{
Note that \code{ntreelimit} is not necesserily equal to the number of boosting iterations
and it is not necesserily equal to the number of trees in a model.
E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
But for multiclass classification, there are multiple trees per iteration,
but \code{ntreelimit} limits the number of boosting iterations.

Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
since gblinear doesn't keep its boosting history.

One possible practical applications of the \code{predleaf} option is to use the model
as a generator of new features which capture non-linearity and interactions,
e.g., as implemented in \code{\link{xgb.create.features}}.
}
\examples{
## binary classification:

data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test

bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
# use all trees by default
pred <- predict(bst, test$data)
# use only the 1st tree
pred <- predict(bst, test$data, ntreelimit = 1)


## multiclass classification in iris dataset:

lb <- as.numeric(iris$Species) - 1
num_class <- 3
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
               max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
               objective = "multi:softprob", num_class = num_class)
# predict for softmax returns num_class probability numbers per case:
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
# reshape it to a num_class-columns matrix
pred <- matrix(pred, ncol=num_class, byrow=TRUE)
# convert the probabilities to softmax labels
pred_labels <- max.col(pred) - 1
# the following should result in the same error as seen in the last iteration
sum(pred_labels != lb)/length(lb)

# compare that to the predictions from softmax:
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
               max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
               objective = "multi:softmax", num_class = num_class)
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
sum(pred5 != lb)/length(lb)


## random forest-like model of 25 trees for binary classification:

set.seed(11)
bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
               nthread = 2, nrounds = 1, objective = "binary:logistic",
               num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
# Inspect the prediction error vs number of trees:
lb <- test$label
dtest <- xgb.DMatrix(test$data, label=lb)
err <- sapply(1:25, function(n) {
  pred <- predict(bst, dtest, ntreelimit=n)
  sum((pred > 0.5) != lb)/length(lb)
})
plot(err, type='l', ylim=c(0,0.1), xlab='#trees')

}
\seealso{
\code{\link{xgb.train}}.
}