[R] in predict: doc, examples, reshape parameter

parent c342614a81
commit b9aeeda074

@@ -1,14 +1,11 @@
 # Construct a Booster from cachelist
 # internal utility function
 xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
-  if (typeof(cachelist) != "list") {
+  if (typeof(cachelist) != "list" ||
+      any(sapply(cachelist, class) != 'xgb.DMatrix')) {
     stop("xgb.Booster only accepts list of DMatrix as cachelist")
   }
-  for (dm in cachelist) {
-    if (class(dm) != "xgb.DMatrix") {
-      stop("xgb.Booster only accepts list of DMatrix as cachelist")
-    }
-  }
 
   handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
   if (!is.null(modelfile)) {
     if (typeof(modelfile) == "character") {
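The consolidated check above folds the old per-element loop into one vectorized test. A minimal standalone sketch of the same logic (the helper name `check_cachelist` is hypothetical, and a stand-in S3 class is used instead of a real `xgb.DMatrix`):

```r
# Sketch of the consolidated cachelist validation, with a stand-in S3
# class in place of a real xgb.DMatrix (illustration only).
check_cachelist <- function(cachelist) {
  if (typeof(cachelist) != "list" ||
      any(sapply(cachelist, class) != "xgb.DMatrix")) {
    stop("xgb.Booster only accepts list of DMatrix as cachelist")
  }
  invisible(TRUE)
}

dm <- structure(list(), class = "xgb.DMatrix")  # stand-in object
check_cachelist(list(dm, dm))     # passes silently
# check_cachelist(list(dm, 1:3))  # would signal the error
```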
@@ -54,6 +51,9 @@ xgb.get.handle <- function(object) {
 # Check whether an xgb.Booster object is complete
 # internal utility function
 xgb.Booster.check <- function(bst, saveraw = TRUE) {
   if (class(bst) != "xgb.Booster")
     stop("argument type must be xgb.Booster")
 
   isnull <- is.null(bst$handle)
   if (!isnull) {
     isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
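For context, a Booster is treated as incomplete when its handle is a NULL external pointer, e.g., after the model was restored from a saved R session. A hedged sketch of just that test, reusing the same C routine as above (the helper name `is_null_handle` is hypothetical):

```r
# Returns TRUE when the Booster's external-pointer handle is missing or
# has been invalidated (e.g., after loading the model from an .RData file).
is_null_handle <- function(bst) {
  is.null(bst$handle) ||
    .Call("XGCheckNullPtr_R", bst$handle, PACKAGE = "xgboost")
}
```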
@@ -67,48 +67,118 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
   return(bst)
 }
 
 
 #' Predict method for eXtreme Gradient Boosting model
 #'
 #' Predicted values based on either xgboost model or model handle object.
 #'
 #' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
-#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
-#'   \code{xgb.DMatrix}.
-#' @param missing Missing is only used when input is dense matrix, pick a float
-#'   value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
-#' @param outputmargin whether the prediction should be shown in the original
-#'   value of sum of functions, when outputmargin=TRUE, the prediction is
-#'   untransformed margin value. In logistic regression, outputmargin=T will
-#'   output value before logistic transformation.
-#' @param ntreelimit limit number of trees used in prediction, this parameter is
-#'   only valid for gbtree, but not for gblinear. set it to be value bigger
-#'   than 0. It will use all trees by default.
-#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
-#' @param ... Parameters pass to \code{predict.xgb.Booster}
+#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.
+#' @param missing Missing is only used when input is dense matrix. Pick a float value that represents
+#'   missing values in data (e.g., sometimes 0 or some other extreme value is used).
+#' @param outputmargin whether the prediction should be returned in the form of the original untransformed
+#'   sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
+#'   logistic regression would result in predictions for log-odds instead of probabilities.
+#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
+#'   It will use all the trees by default (\code{NULL} value).
+#' @param predleaf whether to predict leaf indices instead.
+#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
+#'   prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
+#' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
 #' The purpose of the \code{ntreelimit} option is to let the user train a model with lots
 #' of trees but use only the first trees for prediction to avoid overfitting
 #' (without having to train a new model with fewer trees).
+#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+#' and it is not necessarily equal to the number of trees in a model.
+#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
+#' But for multiclass classification, while there are multiple trees per iteration,
+#' \code{ntreelimit} limits the number of boosting iterations.
 #'
-#' The option \code{predleaf} purpose is inspired from §3.1 of the paper
-#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
-#' The idea is to use the model as a generator of new features which capture non linear link
-#' from original features.
+#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
+#' since gblinear doesn't keep its boosting history.
+#'
+#' One possible practical application of the \code{predleaf} option is to use the model
+#' as a generator of new features which capture non-linearity and interactions,
+#' e.g., as implemented in \code{\link{xgb.create.features}}.
 #'
+#' @return
+#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
+#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
+#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
+#' the \code{reshape} value.
+#'
+#' When \code{predleaf = TRUE}, the output is a matrix object with the
+#' number of columns corresponding to the number of trees.
 #'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
 #' @examples
 #' ## binary classification:
 #'
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
 #'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-#'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' # use all trees by default
 #' pred <- predict(bst, test$data)
+#' # use only the 1st tree
+#' pred <- predict(bst, test$data, ntreelimit = 1)
+#'
+#'
+#' ## multiclass classification in iris dataset:
+#'
+#' lb <- as.numeric(iris$Species) - 1
+#' num_class <- 3
+#' set.seed(11)
+#' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
+#'                max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
+#'                objective = "multi:softprob", num_class = num_class)
+#' # predict with softprob returns num_class probability values per case:
+#' pred <- predict(bst, as.matrix(iris[, -5]))
+#' str(pred)
+#' # reshape it to a num_class-columns matrix
+#' pred <- matrix(pred, ncol=num_class, byrow=TRUE)
+#' # convert the probabilities to softmax labels
+#' pred_labels <- max.col(pred) - 1
+#' # the following should result in the same error as seen in the last iteration
+#' sum(pred_labels != lb)/length(lb)
+#'
+#' # compare that to the predictions from softmax:
+#' set.seed(11)
+#' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
+#'                max_depth = 4, eta = 0.5, nthread = 2, nrounds = 10, subsample = 0.5,
+#'                objective = "multi:softmax", num_class = num_class)
+#' pred <- predict(bst, as.matrix(iris[, -5]))
+#' str(pred)
+#' all.equal(pred, pred_labels)
+#' # prediction from using only 5 iterations should result
+#' # in the same error as seen in iteration 5:
+#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
+#' sum(pred5 != lb)/length(lb)
+#'
+#'
+#' ## random forest-like model of 25 trees for binary classification:
+#'
+#' set.seed(11)
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
+#'                nthread = 2, nrounds = 1, objective = "binary:logistic",
+#'                num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
+#' # Inspect the prediction error vs number of trees:
+#' lb <- test$label
+#' dtest <- xgb.DMatrix(test$data, label=lb)
+#' err <- sapply(1:25, function(n) {
+#'   pred <- predict(bst, dtest, ntreelimit=n)
+#'   sum((pred > 0.5) != lb)/length(lb)
+#' })
+#' plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
+#'
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA,
-                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
+                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE) {
 
   object <- xgb.Booster.check(object, saveraw = FALSE)
   if (class(newdata) != "xgb.DMatrix")
@@ -116,16 +186,26 @@ predict.xgb.Booster <- function(object, newdata, missing = NA,
   if (is.null(ntreelimit))
     ntreelimit <- NVL(object$best_ntreelimit, 0)
   if (ntreelimit < 0)
-    stop("ntreelimit must be positive")
+    stop("ntreelimit cannot be negative")
 
   option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf)
 
   ret <- .Call("XGBoosterPredict_R", object$handle, newdata, option[1],
                as.integer(ntreelimit), PACKAGE = "xgboost")
 
+  if (length(ret) %% nrow(newdata) != 0)
+    stop("prediction length ", length(ret), " is not a multiple of nrows(newdata) ", nrow(newdata))
+  npred_per_case <- length(ret) / nrow(newdata)
 
   if (predleaf){
     len <- nrow(newdata)
-    ret <- if (length(ret) == len) matrix(ret, ncol = 1)
-           else t(matrix(ret, ncol = len))
+    ret <- if (length(ret) == len) {
+      matrix(ret, ncol = 1)
+    } else {
+      t(matrix(ret, ncol = len))
+    }
+  } else if (reshape && npred_per_case > 1) {
+    ret <- matrix(ret, ncol = length(ret) / nrow(newdata), byrow = TRUE)
   }
   return(ret)
 }
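Two details of the body above can be shown in isolation: the `option` value sent to `XGBoosterPredict_R` is a small bitmask (bit 1 = `outputmargin`, bit 2 = `predleaf`), and the new `reshape` branch assumes predictions arrive as one flat, case-major vector. A toy sketch, no trained model needed (`encode_option` is a hypothetical helper mirroring the line above):

```r
# The option bitmask passed to the C predictor:
encode_option <- function(outputmargin = FALSE, predleaf = FALSE)
  0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf)
encode_option()                     # 0: transformed predictions
encode_option(outputmargin = TRUE)  # 1: untransformed margins
encode_option(predleaf = TRUE)      # 2: leaf indices

# What reshape = TRUE does for multiclass output: 4 cases x 3 classes
# come back as a flat, case-major vector of length 12.
ret <- c(0.7, 0.2, 0.1,  0.1, 0.8, 0.1,  0.3, 0.3, 0.4,  0.5, 0.3, 0.2)
npred_per_case <- length(ret) / 4   # 3 prediction outputs per case
matrix(ret, ncol = npred_per_case, byrow = TRUE)  # one row per case
```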
@@ -169,9 +249,13 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #'
 #' The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
 #' than for \code{xgb.Booster}, since only a handle (pointer) would need to be copied.
+#' That would only matter if attributes need to be set many times.
+#' Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
+#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
+#' and it would be the user's responsibility to call \code{xgb.save.raw} to update it.
 #'
 #' The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
-#' but doesn't delete the existing attributes which don't have their names in \code{names(attributes)}.
+#' but it doesn't delete the other existing attributes.
 #'
 #' @return
 #' \code{xgb.attr} returns either a string value of an attribute
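A usage sketch of the merge semantics described above (assumes a trained model `bst` as in the examples below; the attribute names are made up):

```r
# xgb.attributes<- merges rather than replaces: setting new attributes
# keeps the other existing ones (assumes a trained model 'bst').
xgb.attributes(bst) <- list(run_id = "exp-01")
xgb.attributes(bst) <- list(note = "tuned eta")  # 'run_id' is kept
names(xgb.attributes(bst))                       # includes both names
```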
@@ -184,8 +268,8 @@ predict.xgb.Booster.handle <- function(object, ...) {
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #'
 #' xgb.attr(bst, "my_attribute") <- "my attribute value"
 #' print(xgb.attr(bst, "my_attribute"))
@@ -279,8 +363,8 @@ xgb.attributes <- function(object) {
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
 #'
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #'
 #' xgb.parameters(bst) <- list(eta = 0.1)
 #'
@@ -304,6 +388,12 @@ xgb.attributes <- function(object) {
   object
 }
 
+# Extract # of trees in a model
+# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration
+# internal utility function
+xgb.ntree <- function(bst) {
+  length(grep('^booster', xgb.dump(bst)))
+}
+
 
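A note on the counting trick in `xgb.ntree` above: every tree in the model's text dump opens with a `booster[i]` header line, so grep'ing for those headers recovers the tree count. A sketch (assumes a trained gbtree model `bst` as in the examples):

```r
# Each tree in the text dump starts with a header line like "booster[0]",
# so counting those lines gives the number of trees in the model.
dump_lines <- xgb.dump(bst)              # character vector, one line each
head(dump_lines, 3)                      # first lines, starting at "booster[0]"
length(grep('^booster', dump_lines))     # total number of trees
```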
 #' Print xgb.Booster
@@ -317,8 +407,8 @@ xgb.attributes <- function(object) {
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' train <- agaricus.train
-#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
-#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' attr(bst, 'myattr') <- 'memo'
 #'
 #' print(bst)
@@ -334,9 +424,11 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
   }
 
   cat('raw: ')
-  if (!is.null(x$raw)) cat(format(object.size(x$raw), units="auto"), '\n')
-  else cat('NULL\n')
+  if (!is.null(x$raw)) {
+    cat(format(object.size(x$raw), units="auto"), '\n')
+  } else {
+    cat('NULL\n')
+  }
   if (!is.null(x$call)) {
     cat('call:\n ')
     print(x$call)
@@ -371,7 +463,11 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
     })
   }
 
-  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks','evaluation_log'))) {
+  cat('niter: ', x$niter, '\n', sep='')
+  # TODO: uncomment when faster xgb.ntree is implemented
+  #cat('ntree: ', xgb.ntree(x), '\n', sep='')
+
+  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks','evaluation_log','niter'))) {
     if (is.atomic(x[[n]])) {
       cat(n, ': ', x[[n]], '\n', sep='')
     } else {