[R] docs update - callbacks and parameter style

2016-06-27 01:59:58 -05:00
parent e9eb34fabc
commit a0aa305268
28 changed files with 564 additions and 162 deletions
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -6,53 +6,124 @@
 \title{Predict method for eXtreme Gradient Boosting model}
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
-  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
+  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
+  reshape = FALSE)

 \method{predict}{xgb.Booster.handle}(object, ...)
 }
 \arguments{
 \item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}}

-\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or 
-\code{xgb.DMatrix}.}
+\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.}

-\item{missing}{Missing is only used when input is dense matrix, pick a float 
-value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
+\item{missing}{Missing is only used when input is a dense matrix. Pick a float value that represents
+missing values in data (e.g., sometimes 0 or some other extreme value is used).}

-\item{outputmargin}{whether the prediction should be shown in the original
-value of sum of functions, when outputmargin=TRUE, the prediction is 
-untransformed margin value. In logistic regression, outputmargin=T will
-output value before logistic transformation.}
+\item{outputmargin}{whether the prediction should be returned in the form of the original untransformed 
+sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for 
+logistic regression would result in predictions for log-odds instead of probabilities.}

-\item{ntreelimit}{limit number of trees used in prediction, this parameter is
-only valid for gbtree, but not for gblinear. set it to be value bigger 
-than 0. It will use all trees by default.}
+\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
+It will use all the trees by default (\code{NULL} value).}

-\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.}
+\item{predleaf}{whether to predict leaf index instead.}

-\item{...}{Parameters pass to \code{predict.xgb.Booster}}
+\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several 
+prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
+
+\item{...}{Parameters passed to \code{predict.xgb.Booster}}
+}
+\value{
+For regression or binary classification, it returns a vector of length \code{nrow(newdata)}.
+For multiclass classification, either a \code{num_class * nrow(newdata)} vector or 
+a \code{(nrow(newdata), num_class)} dimension matrix is returned, depending on 
+the \code{reshape} value.
+
+When \code{predleaf = TRUE}, the output is a matrix object with the 
+number of columns corresponding to the number of trees.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
 }
 \details{
-The option \code{ntreelimit} purpose is to let the user train a model with lots 
-of trees but use only the first trees for prediction to avoid overfitting 
-(without having to train a new model with less trees).
+Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+and it is not necessarily equal to the number of trees in a model.
+E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
+But for multiclass classification, while there are multiple trees per iteration, 
+\code{ntreelimit} limits the number of boosting iterations.

-The option \code{predleaf} purpose is inspired from §3.1 of the paper 
-\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
-The idea is to use the model as a generator of new features which capture non linear link 
-from original features.
+Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, 
+since gblinear doesn't keep its boosting history. 
+
+One possible practical application of the \code{predleaf} option is to use the model 
+as a generator of new features which capture non-linearity and interactions, 
+e.g., as implemented in \code{\link{xgb.create.features}}.
 }
 \examples{
+## binary classification:
+
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test

-bst <- xgboost(data = train$data, label = train$label, max.depth = 2, 
-               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
+bst <- xgboost(data = train$data, label = train$label, max_depth = 2, 
+               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+# use all trees by default
 pred <- predict(bst, test$data)
+# use only the 1st tree
+pred <- predict(bst, test$data, ntreelimit = 1)
+
+
+## multiclass classification in iris dataset:
+
+lb <- as.numeric(iris$Species) - 1
+num_class <- 3
+set.seed(11)
+bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
+               max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
+               objective = "multi:softprob", num_class = num_class)
+# predict for softprob returns num_class probability numbers per case:
+pred <- predict(bst, as.matrix(iris[, -5]))
+str(pred)
+# reshape it to a num_class-columns matrix
+pred <- matrix(pred, ncol=num_class, byrow=TRUE)
+# convert the probabilities to softmax labels
+pred_labels <- max.col(pred) - 1
+# the following should result in the same error as seen in the last iteration
+sum(pred_labels != lb)/length(lb)
+
+# compare that to the predictions from softmax:
+set.seed(11)
+bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
+               max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
+               objective = "multi:softmax", num_class = num_class)
+pred <- predict(bst, as.matrix(iris[, -5]))
+str(pred)
+all.equal(pred, pred_labels)
+# prediction from using only 5 iterations should result 
+# in the same error as seen in iteration 5:
+pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+sum(pred5 != lb)/length(lb)
+
+
+## random forest-like model of 25 trees for binary classification:
+
+set.seed(11)
+bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
+               nthread = 2, nrounds = 1, objective = "binary:logistic",
+               num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
+# Inspect the prediction error vs number of trees:
+lb <- test$label
+dtest <- xgb.DMatrix(test$data, label=lb)
+err <- sapply(1:25, function(n) {
+  pred <- predict(bst, dtest, ntreelimit=n)
+  sum((pred > 0.5) != lb)/length(lb)
+})
+plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
+
+}
+\seealso{
+\code{\link{xgb.train}}.
 }