R-callbacks docs

2016-06-09 02:52:09 -05:00
parent 422b0000a8
commit 2e0ffcc303
12 changed files with 396 additions and 107 deletions
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,16 +1,24 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/xgb.train.R
+% Please edit documentation in R/xgb.train.R, R/xgboost.R
 \name{xgb.train}
 \alias{xgb.train}
+\alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
  feval = NULL, verbose = 1, print.every.n = 1L,
+  early.stop.round = NULL, maximize = NULL, save_period = NULL,
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+
+xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
+  params = list(), nrounds, verbose = 1, print.every.n = 1L,
  early.stop.round = NULL, maximize = NULL, save_period = 0,
-  save_name = "xgboost.model", ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
 }
 \arguments{
 \item{params}{the list of parameters. 
+       The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
+       Below is a shorter summary:

 1. General Parameters

@@ -59,15 +67,16 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
  \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 }}

-\item{data}{takes an \code{xgb.DMatrix} as the input.}
+\item{data}{input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
+\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or a local data file.}

 \item{nrounds}{the max number of iterations}

 \item{watchlist}{what information should be printed when \code{verbose=1} or
 \code{verbose=2}. Watchlist is used to specify validation set monitoring
 during training. For example user can specify
- watchlist=list(validation1=mat1, validation2=mat2) to watch
- the performance of each round's model on mat1 and mat2}
+watchlist=list(validation1=mat1, validation2=mat2) to watch
+the performance of each round's model on mat1 and mat2}

 \item{obj}{customized objective function. Returns gradient and second order 
 gradient with given prediction and dtrain,}
@@ -79,53 +88,95 @@ prediction and dtrain,}
 \item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print 
 information of performance. If 2, xgboost will print information of both}

-\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
+\item{print.every.n}{Print every N progress messages when \code{verbose>0}.
+Default is 1 which means all messages are printed.}

 \item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. 
 If set to an integer \code{k}, training with a validation set will stop if the performance 
 keeps getting worse consecutively for \code{k} rounds.}

-\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
+\item{maximize}{If \code{feval} and \code{early.stop.round} are set, 
+then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}

-\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
+\item{save_period}{save the model to the disk after every \code{save_period} rounds, 0 means save at the end.}

 \item{save_name}{the name or path for periodically saved model file.}

+\item{xgb_model}{the previously built model to continue the training from. 
+Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a 
+file with a previously saved model.}
+
+\item{callbacks}{a list of callback functions to perform various tasks during boosting. 
+See \code{\link{callbacks}}. Some of the callbacks are currently automatically 
+created when specific parameters are set.}
+
 \item{...}{other parameters to pass to \code{params}.}
+
+\item{label}{the response variable. User should not set this field,
+if data is local data file or \code{xgb.DMatrix}.}
+
+\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
+by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
+This parameter is only used when input is dense matrix.}
+
+\item{weight}{a vector indicating the weight for each row of the input.}
+}
+\value{
+TODO
 }
 \description{
-An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
+\code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
 }
 \details{
-This is the training function for \code{xgboost}. 
+These are the training functions for \code{xgboost}. 

-It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}),
-therefore it is more flexible than \code{\link{xgboost}} function.
+The \code{xgb.train} interface supports advanced features such as \code{watchlist}, 
+customized objective and evaluation metric functions, therefore it is more flexible 
+than the \code{\link{xgboost}} interface.

 Parallelization is automatically enabled if \code{OpenMP} is present. 
 Number of threads can also be manually specified via \code{nthread} parameter.

-\code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
+The evaluation metric is chosen automatically by Xgboost (according to the objective)
+when the \code{eval_metric} parameter is not provided.
+User may set one or several \code{eval_metric} parameters. 
+Note that when using a customized metric, only this single metric can be used.
+The following is the list of built-in metrics for which Xgboost provides an optimized implementation:
  \itemize{
     \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
     \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
     \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
-     \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
+     \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
+           By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
+           A different threshold (e.g., 0.) can be specified as "error@0."
     \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
     \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
     \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
  }
-  
-Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.

-This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
+The following callbacks are automatically created when certain parameters are set:
+\itemize{
+  \item \code{cb.print_evaluation} is turned on when \code{verbose > 0};
+        and the \code{print.every.n} parameter is passed to it.
+  \item \code{cb.log_evaluation} is on when \code{verbose > 0} and \code{watchlist} is present.
+  \item \code{cb.early_stop}: when \code{early.stop.round} is set.
+  \item \code{cb.save_model}: when \code{save_period > 0} is set.
+}
 }
 \examples{
 data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- dtrain
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 watchlist <- list(eval = dtest, train = dtrain)
+
+## A simple xgb.train example:
+param <- list(max.depth = 2, eta = 1, silent = 1, objective="binary:logistic", eval_metric="auc")
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+
+## An xgb.train example where custom objective and evaluation metric are used:
 logregobj <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
   preds <- 1/(1 + exp(-preds))
@@ -138,7 +189,23 @@ evalerror <- function(preds, dtrain) {
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
 }
-param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
 bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
+
+## An xgb.train example of using variable learning rates at each iteration:
+my_etas <- list(eta = c(0.5, 0.1))
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
+                 callbacks = list(cb.reset_parameters(my_etas)))
+
+## Explicit use of the cb.log_evaluation callback allows running 
+## xgb.train silently but still store the evaluation results:
+bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist,
+                 verbose = 0, callbacks = list(cb.log_evaluation()))
+print(bst$evaluation_log)
+
+## An 'xgboost' interface example:
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, 
+               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+pred <- predict(bst, agaricus.test$data)
+
 }