[R] Redesigned xgboost() interface skeleton (#10456)

--------- Co-authored-by: Michael Mayer <mayermichael79@gmail.com>
2024-07-15 12:44:58 +02:00
parent 17c64300e3
commit ab982e7873
35 changed files with 1997 additions and 242 deletions
--- a/R-package/man/print.xgb.Booster.Rd
+++ b/R-package/man/print.xgb.Booster.Rd
@@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
--- a/R-package/man/xgb.attr.Rd
+++ b/R-package/man/xgb.attr.Rd
@@ -64,9 +64,8 @@ example of these behaviors).
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
--- a/R-package/man/xgb.config.Rd
+++ b/R-package/man/xgb.config.Rd
@@ -35,9 +35,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = nthread,
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
+                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
 dump_path = file.path(tempdir(), 'model.dump')
 xgb.dump(bst, dump_path, with_stats = TRUE)
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati
 # binomial classification using "gbtree":
 data(agaricus.train, package = "xgboost")

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
@@ -83,9 +82,8 @@ bst <- xgboost(
 xgb.importance(model = bst)

 # binomial classification using "gblinear":
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  booster = "gblinear",
  eta = 0.3,
  nthread = 1,
@@ -97,9 +95,11 @@ xgb.importance(model = bst)
 # multiclass classification using "gbtree":
 nclass <- 3
 nrounds <- 10
-mbst <- xgboost(
-  data = as.matrix(iris[, -5]),
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(
+    as.matrix(iris[, -5]),
+    label = as.numeric(iris$Species) - 1
+  ),
  max_depth = 3,
  eta = 0.2,
  nthread = 2,
@@ -123,9 +123,11 @@ xgb.importance(
 )

 # multiclass classification using "gblinear":
-mbst <- xgboost(
-  data = scale(as.matrix(iris[, -5])),
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(
+    scale(as.matrix(iris[, -5])),
+    label = as.numeric(iris$Species) - 1
+  ),
  booster = "gblinear",
  eta = 0.2,
  nthread = 1,
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 1
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 2,
  eta = 1,
  nthread = nthread,
--- a/R-package/man/xgb.parameters.Rd
+++ b/R-package/man/xgb.parameters.Rd
@@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero.
 data(agaricus.train, package = "xgboost")
 train <- agaricus.train

-bst <- xgboost(
-  data = train$data,
-  label = train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(train$data, label = train$label),
  max_depth = 2,
  eta = 1,
  nthread = 2,
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -73,9 +73,8 @@ nthread <- 2
 data.table::setDTthreads(nthread)

 ## Change max_depth to a higher number to get a more significant result
-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 6,
  nthread = nthread,
  nrounds = 50,
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -88,9 +88,8 @@ data(agaricus.train)
 nthread <- 2
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 3,
  eta = 1,
  nthread = nthread,
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost")
 nthread <- 2
 data.table::setDTthreads(nthread)

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 15,
  eta = 1,
  nthread = nthread,
--- a/R-package/man/xgb.plot.shap.Rd
+++ b/R-package/man/xgb.plot.shap.Rd
@@ -135,9 +135,8 @@ nthread <- 1
 data.table::setDTthreads(nthread)
 nrounds <- 20

-bst <- xgboost(
-  agaricus.train$data,
-  agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  nrounds = nrounds,
  eta = 0.1,
  max_depth = 3,
@@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5])
 set.seed(123)
 is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values

-mbst <- xgboost(
-  data = x,
-  label = as.numeric(iris$Species) - 1,
+mbst <- xgb.train(
+  data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
  nrounds = nrounds,
  max_depth = 2,
  eta = 0.3,
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back
 \examples{
 data(agaricus.train, package = "xgboost")

-bst <- xgboost(
-  data = agaricus.train$data,
-  label = agaricus.train$label,
+bst <- xgb.train(
+  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 3,
  eta = 1,
  nthread = 2,
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,8 +1,7 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/xgb.train.R, R/xgboost.R
+% Please edit documentation in R/xgb.train.R
 \name{xgb.train}
 \alias{xgb.train}
-\alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
 xgb.train(
@@ -22,24 +21,6 @@ xgb.train(
  callbacks = list(),
  ...
 )
-
-xgboost(
-  data = NULL,
-  label = NULL,
-  missing = NA,
-  weight = NULL,
-  params = list(),
-  nrounds,
-  verbose = 1,
-  print_every_n = 1L,
-  early_stopping_rounds = NULL,
-  maximize = NULL,
-  save_period = NULL,
-  save_name = "xgboost.model",
-  xgb_model = NULL,
-  callbacks = list(),
-  ...
-)
 }
 \arguments{
 \item{params}{the list of parameters. The complete list of parameters is
@@ -240,15 +221,6 @@ to customize the training process.
 }\if{html}{\out{</div>}}}

 \item{...}{other parameters to pass to \code{params}.}
-
-\item{label}{vector of response values. Should not be provided when data is
-a local data file name or an \code{xgb.DMatrix}.}
-
-\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
-by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
-This parameter is only used when input is a dense matrix.}
-
-\item{weight}{a vector indicating the weight for each row of the input.}
 }
 \value{
 An object of class \code{xgb.Booster}.
@@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
                 early_stopping_rounds = 3)

 ## An 'xgboost' interface example:
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-               max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
-               objective = "binary:logistic")
+bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
+               max_depth = 2, eta = 1, nthreads = nthread, nrounds = 2)
 pred <- predict(bst, agaricus.test$data)

 }
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -0,0 +1,213 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgboost.R
+\name{xgboost}
+\alias{xgboost}
+\title{Fit XGBoost Model}
+\usage{
+xgboost(
+  x,
+  y,
+  objective = NULL,
+  nrounds = 100L,
+  weights = NULL,
+  verbosity = 0L,
+  nthreads = parallel::detectCores(),
+  seed = 0L,
+  monotone_constraints = NULL,
+  interaction_constraints = NULL,
+  feature_weights = NULL,
+  base_margin = NULL,
+  ...
+)
+}
+\arguments{
+\item{x}{The features / covariates. Can be passed as:\itemize{
+\item A numeric or integer `matrix`.
+\item A `data.frame`, in which all columns are one of the following types:\itemize{
+  \item `numeric`
+  \item `integer`
+  \item `logical`
+  \item `factor`
+}
+
+Columns of `factor` type will be assumed to be categorical, while other column types will
+be assumed to be numeric.
+\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class.
+}
+
+Note that categorical features are only supported for `data.frame` inputs, and are automatically
+determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible
+variants that would allow something like categorical features on sparse matrices.}
+
+\item{y}{The response variable. Allowed values are:\itemize{
+\item A numeric or integer vector (for regression tasks).
+\item A factor or character vector (for binary and multi-class classification tasks).
+\item A logical (boolean) vector (for binary classification tasks).
+\item A numeric or integer matrix or `data.frame` with numeric/integer columns
+(for multi-task regression tasks).
+\item A `Surv` object from the `survival` package (for survival tasks).
+}
+
+If `objective` is `NULL`, the right task will be determined automatically based on
+the class of `y`.
+
+If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+can only be used with classification objectives and vice-versa.
+
+For binary classification, the last factor level of `y` will be used as the "positive"
+class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
+class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
+set as the last level.}
+
+\item{objective}{Optimization objective to minimize based on the supplied data, to be passed
+by name as a string / character (e.g. `reg:absoluteerror`). See the
+\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
+Learning Task Parameters} page for more detailed information on allowed values.
+
+If `NULL` (the default), will be automatically determined from `y` according to the following
+logic:\itemize{
+\item If `y` is a factor with 2 levels, will use `binary:logistic`.
+\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
+will be determined automatically, should not be passed as a parameter under `...`).
+\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
+the only types supported are left / right / interval censored).
+\item Otherwise, will use `reg:squarederror`.
+}
+
+If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
+can only be used with classification objectives and vice-versa.
+
+Note that not all possible `objective` values supported by the core XGBoost library are allowed
+here - for example, objectives which are a variation of another but with a different default
+prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
+ranking objectives, nor custom objectives at the moment.}
+
+\item{nrounds}{Number of boosting iterations / rounds.
+
+Note that the number of default boosting rounds here is not automatically tuned, and different
+problems will have vastly different optimal numbers of boosting rounds.}
+
+\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
+will have the same weight.
+
+If not `NULL`, should be passed as a numeric vector with length matching to the number of
+rows in `x`.}
+
+\item{verbosity}{Verbosity of printing messages. Valid values are 0 (silent), 1 (warning),
+2 (info), and 3 (debug).}
+
+\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.}
+
+\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random
+number using R's PRNG system to use as seed.}
+
+\item{monotone_constraints}{Optional monotonicity constraints for features.
+
+Can be passed either as a named list (when `x` has column names), or as a vector. If passed
+as a vector and `x` has column names, will try to match the elements by name.
+
+A value of `+1` for a given feature makes the model predictions / scores constrained to be
+a monotonically increasing function of that feature (that is, as the value of the feature
+increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically
+decreasing function. A value of zero imposes no constraint.
+
+The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which
+case the columns that are not referred to in `monotone_constraints` will be assumed to have
+a value of zero (no constraint imposed on the model for those features).
+
+See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{
+Monotonic Constraints} for a more detailed explanation.}
+
+\item{interaction_constraints}{Constraints for interaction representing permitted interactions.
+The constraints must be specified in the form of a list of vectors referencing columns in the
+data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numbering
+starting at 1 - i.e. the first vector references the first and second columns) or
+`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references
+columns by names), where each vector is a group of indices of features that are allowed to
+interact with each other.
+
+See the tutorial
+\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{
+Feature Interaction Constraints} for more information.}
+
+\item{feature_weights}{Feature weights for column sampling.
+
+Can be passed either as a vector with length matching the number of columns of `x`, or as a
+named list (only if `x` has column names) with names matching the columns of `x`. If it is a
+named vector, will try to match the entries to column names of `x` by name.
+
+If `NULL` (the default), all columns will have the same weight.}
+
+\item{base_margin}{Base margin used for boosting from existing model.
+
+If passing it, will start the gradient boosting procedure from the scores that are provided
+here - for example, one can pass the raw scores from a previous model, or some per-observation
+offset, or similar.
+
+Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives)
+with the same number of rows as `x` and number of columns corresponding to number of optimization
+targets, and should be in the untransformed scale (for example, for objective `binary:logistic`,
+it should have log-odds, not probabilities; and for objective `multi:softprob`, should have
+number of columns matching to number of classes in the data).
+
+Note that, if it contains more than one column, then columns will not be matched by name to
+the corresponding `y` - `base_margin` should have the same column order that the model will use
+(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against
+`levels(y)` by their position, regardless of what `colnames(base_margin)` returns).
+
+If `NULL`, will start from zero, but note that for most objectives, an intercept is usually
+added (controllable through parameter `base_score` instead) when `base_margin` is not passed.}
+
+\item{...}{Other training parameters. See the online documentation
+\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for
+details about possible values and what they do.
+
+Note that not all possible parameters from the core XGBoost library are allowed under `...` for
+`xgboost()` - in particular, values which require an already-fitted booster object (such as
+`process_type`) are not accepted here.}
+}
+\value{
+A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular
+`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an
+additional attribute `metadata` containing information which is used for formatting prediction
+outputs, such as class names for classification problems.
+}
+\description{
+Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.
+
+See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{
+Introduction to Boosted Trees} for a longer explanation of what XGBoost does.
+
+This function is intended to provide a more user-friendly interface for XGBoost that follows
+R's conventions for model fitting and predictions, but which doesn't expose all of the
+possible functionalities of the core XGBoost library.
+
+See \link{xgb.train} for a more flexible low-level alternative which is similar across different
+language bindings of XGBoost and which exposes the full library's functionalities.
+}
+\details{
+For package authors using `xgboost` as a dependency, it is highly recommended to use
+\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface
+and performs fewer data conversions and copies along the way.
+}
+\examples{
+library(xgboost)
+data(mtcars)
+
+# Fit a small regression model on the mtcars data
+model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
+predict(model_regression, mtcars, validate_features = TRUE)
+
+# Task objective is determined automatically according to the type of 'y'
+data(iris)
+model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5)
+predict(model_classif, iris, validate_features = TRUE)
+}
+\references{
+\itemize{
+\item Chen, Tianqi, and Carlos Guestrin. "XGBoost: A Scalable Tree Boosting System."
+Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and
+Data Mining. 2016.
+\item \url{https://xgboost.readthedocs.io/en/stable/}
+}
+}