[R] Redesigned xgboost() interface skeleton (#10456)
--------- Co-authored-by: Michael Mayer <mayermichael79@gmail.com>
This commit is contained in:
@@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}.
|
||||
data(agaricus.train, package = "xgboost")
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(
|
||||
data = train$data,
|
||||
label = train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
|
||||
@@ -64,9 +64,8 @@ example of these behaviors).
|
||||
data(agaricus.train, package = "xgboost")
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(
|
||||
data = train$data,
|
||||
label = train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
|
||||
@@ -35,9 +35,8 @@ nthread <- 1
|
||||
data.table::setDTthreads(nthread)
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(
|
||||
data = train$data,
|
||||
label = train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = nthread,
|
||||
|
||||
@@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
|
||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||
# save the model in file 'xgb.model.dump'
|
||||
dump_path = file.path(tempdir(), 'model.dump')
|
||||
xgb.dump(bst, dump_path, with_stats = TRUE)
|
||||
|
||||
@@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati
|
||||
# binomial classification using "gbtree":
|
||||
data(agaricus.train, package = "xgboost")
|
||||
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
@@ -83,9 +82,8 @@ bst <- xgboost(
|
||||
xgb.importance(model = bst)
|
||||
|
||||
# binomial classification using "gblinear":
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
booster = "gblinear",
|
||||
eta = 0.3,
|
||||
nthread = 1,
|
||||
@@ -97,9 +95,11 @@ xgb.importance(model = bst)
|
||||
# multiclass classification using "gbtree":
|
||||
nclass <- 3
|
||||
nrounds <- 10
|
||||
mbst <- xgboost(
|
||||
data = as.matrix(iris[, -5]),
|
||||
label = as.numeric(iris$Species) - 1,
|
||||
mbst <- xgb.train(
|
||||
data = xgb.DMatrix(
|
||||
as.matrix(iris[, -5]),
|
||||
label = as.numeric(iris$Species) - 1
|
||||
),
|
||||
max_depth = 3,
|
||||
eta = 0.2,
|
||||
nthread = 2,
|
||||
@@ -123,9 +123,11 @@ xgb.importance(
|
||||
)
|
||||
|
||||
# multiclass classification using "gblinear":
|
||||
mbst <- xgboost(
|
||||
data = scale(as.matrix(iris[, -5])),
|
||||
label = as.numeric(iris$Species) - 1,
|
||||
mbst <- xgb.train(
|
||||
data = xgb.DMatrix(
|
||||
scale(as.matrix(iris[, -5])),
|
||||
label = as.numeric(iris$Species) - 1
|
||||
),
|
||||
booster = "gblinear",
|
||||
eta = 0.2,
|
||||
nthread = 1,
|
||||
|
||||
@@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost")
|
||||
nthread <- 1
|
||||
data.table::setDTthreads(nthread)
|
||||
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = nthread,
|
||||
|
||||
@@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero.
|
||||
data(agaricus.train, package = "xgboost")
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(
|
||||
data = train$data,
|
||||
label = train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(train$data, label = train$label),
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
|
||||
@@ -73,9 +73,8 @@ nthread <- 2
|
||||
data.table::setDTthreads(nthread)
|
||||
|
||||
## Change max_depth to a higher number to get a more significant result
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
max_depth = 6,
|
||||
nthread = nthread,
|
||||
nrounds = 50,
|
||||
|
||||
@@ -88,9 +88,8 @@ data(agaricus.train)
|
||||
nthread <- 2
|
||||
data.table::setDTthreads(nthread)
|
||||
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
max_depth = 3,
|
||||
eta = 1,
|
||||
nthread = nthread,
|
||||
|
||||
@@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost")
|
||||
nthread <- 2
|
||||
data.table::setDTthreads(nthread)
|
||||
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
||||
max_depth = 15,
|
||||
eta = 1,
|
||||
nthread = nthread,
|
||||
|
||||
@@ -135,9 +135,8 @@ nthread <- 1
|
||||
data.table::setDTthreads(nthread)
|
||||
nrounds <- 20
|
||||
|
||||
bst <- xgboost(
|
||||
agaricus.train$data,
|
||||
agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
|
||||
nrounds = nrounds,
|
||||
eta = 0.1,
|
||||
max_depth = 3,
|
||||
@@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5])
|
||||
set.seed(123)
|
||||
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
|
||||
|
||||
mbst <- xgboost(
|
||||
data = x,
|
||||
label = as.numeric(iris$Species) - 1,
|
||||
mbst <- xgb.train(
|
||||
data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
|
||||
nrounds = nrounds,
|
||||
max_depth = 2,
|
||||
eta = 0.3,
|
||||
|
||||
@@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back
|
||||
\examples{
|
||||
data(agaricus.train, package = "xgboost")
|
||||
|
||||
bst <- xgboost(
|
||||
data = agaricus.train$data,
|
||||
label = agaricus.train$label,
|
||||
bst <- xgb.train(
|
||||
data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
|
||||
max_depth = 3,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.train.R, R/xgboost.R
|
||||
% Please edit documentation in R/xgb.train.R
|
||||
\name{xgb.train}
|
||||
\alias{xgb.train}
|
||||
\alias{xgboost}
|
||||
\title{eXtreme Gradient Boosting Training}
|
||||
\usage{
|
||||
xgb.train(
|
||||
@@ -22,24 +21,6 @@ xgb.train(
|
||||
callbacks = list(),
|
||||
...
|
||||
)
|
||||
|
||||
xgboost(
|
||||
data = NULL,
|
||||
label = NULL,
|
||||
missing = NA,
|
||||
weight = NULL,
|
||||
params = list(),
|
||||
nrounds,
|
||||
verbose = 1,
|
||||
print_every_n = 1L,
|
||||
early_stopping_rounds = NULL,
|
||||
maximize = NULL,
|
||||
save_period = NULL,
|
||||
save_name = "xgboost.model",
|
||||
xgb_model = NULL,
|
||||
callbacks = list(),
|
||||
...
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{the list of parameters. The complete list of parameters is
|
||||
@@ -240,15 +221,6 @@ to customize the training process.
|
||||
}\if{html}{\out{</div>}}}
|
||||
|
||||
\item{...}{other parameters to pass to \code{params}.}
|
||||
|
||||
\item{label}{vector of response values. Should not be provided when data is
|
||||
a local data file name or an \code{xgb.DMatrix}.}
|
||||
|
||||
\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing'
|
||||
by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values.
|
||||
This parameter is only used when input is a dense matrix.}
|
||||
|
||||
\item{weight}{a vector indicating the weight for each row of the input.}
|
||||
}
|
||||
\value{
|
||||
An object of class \code{xgb.Booster}.
|
||||
@@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals,
|
||||
early_stopping_rounds = 3)
|
||||
|
||||
## An 'xgboost' interface example:
|
||||
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
|
||||
max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
|
||||
objective = "binary:logistic")
|
||||
bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label),
|
||||
params = list(max_depth = 2, eta = 1), nthread = nthread, nrounds = 2)
|
||||
pred <- predict(bst, agaricus.test$data)
|
||||
|
||||
}
|
||||
|
||||
213
R-package/man/xgboost.Rd
Normal file
213
R-package/man/xgboost.Rd
Normal file
@@ -0,0 +1,213 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgboost.R
|
||||
\name{xgboost}
|
||||
\alias{xgboost}
|
||||
\title{Fit XGBoost Model}
|
||||
\usage{
|
||||
xgboost(
|
||||
x,
|
||||
y,
|
||||
objective = NULL,
|
||||
nrounds = 100L,
|
||||
weights = NULL,
|
||||
verbosity = 0L,
|
||||
nthreads = parallel::detectCores(),
|
||||
seed = 0L,
|
||||
monotone_constraints = NULL,
|
||||
interaction_constraints = NULL,
|
||||
feature_weights = NULL,
|
||||
base_margin = NULL,
|
||||
...
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{x}{The features / covariates. Can be passed as:\itemize{
|
||||
\item A numeric or integer `matrix`.
|
||||
\item A `data.frame`, in which all columns are one of the following types:\itemize{
|
||||
\item `numeric`
|
||||
\item `integer`
|
||||
\item `logical`
|
||||
\item `factor`
|
||||
}
|
||||
|
||||
Columns of `factor` type will be assumed to be categorical, while other column types will
|
||||
be assumed to be numeric.
|
||||
\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class.
|
||||
}
|
||||
|
||||
Note that categorical features are only supported for `data.frame` inputs, and are automatically
|
||||
determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible
|
||||
variants that would allow something like categorical features on sparse matrices.}
|
||||
|
||||
\item{y}{The response variable. Allowed values are:\itemize{
|
||||
\item A numeric or integer vector (for regression tasks).
|
||||
\item A factor or character vector (for binary and multi-class classification tasks).
|
||||
\item A logical (boolean) vector (for binary classification tasks).
|
||||
\item A numeric or integer matrix or `data.frame` with numeric/integer columns
|
||||
(for multi-task regression tasks).
|
||||
\item A `Surv` object from the `survival` package (for survival tasks).
|
||||
}
|
||||
|
||||
If `objective` is `NULL`, the right task will be determined automatically based on
|
||||
the class of `y`.
|
||||
|
||||
If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
|
||||
can only be used with classification objectives and vice-versa.
|
||||
|
||||
For binary classification, the last factor level of `y` will be used as the "positive"
|
||||
class - that is, the numbers from `predict` will reflect the probabilities of belonging to this
|
||||
class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be
|
||||
set as the last level.}
|
||||
|
||||
\item{objective}{Optimization objective to minimize based on the supplied data, to be passed
|
||||
by name as a string / character (e.g. `reg:absoluteerror`). See the
|
||||
\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{
|
||||
Learning Task Parameters} page for more detailed information on allowed values.
|
||||
|
||||
If `NULL` (the default), will be automatically determined from `y` according to the following
|
||||
logic:\itemize{
|
||||
\item If `y` is a factor with 2 levels, will use `binary:logistic`.
|
||||
\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes
|
||||
will be determined automatically, should not be passed under `params`).
|
||||
\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that
|
||||
the only types supported are left / right / interval censored).
|
||||
\item Otherwise, will use `reg:squarederror`.
|
||||
}
|
||||
|
||||
If `objective` is not `NULL`, it must match with the type of `y` - e.g. `factor` types of `y`
|
||||
can only be used with classification objectives and vice-versa.
|
||||
|
||||
Note that not all possible `objective` values supported by the core XGBoost library are allowed
|
||||
here - for example, objectives which are a variation of another but with a different default
|
||||
prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are
|
||||
ranking objectives, nor custom objectives at the moment.}
|
||||
|
||||
\item{nrounds}{Number of boosting iterations / rounds.
|
||||
|
||||
Note that the default number of boosting rounds here is not automatically tuned, and different
|
||||
problems will have vastly different optimal numbers of boosting rounds.}
|
||||
|
||||
\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row
|
||||
will have the same weight.
|
||||
|
||||
If not `NULL`, should be passed as a numeric vector with length matching the number of
|
||||
rows in `x`.}
|
||||
|
||||
\item{verbosity}{Verbosity of printing messages. Valid values are 0 (silent), 1 (warning),
|
||||
2 (info), and 3 (debug).}
|
||||
|
||||
\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.}
|
||||
|
||||
\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random
|
||||
number using R's PRNG system to use as seed.}
|
||||
|
||||
\item{monotone_constraints}{Optional monotonicity constraints for features.
|
||||
|
||||
Can be passed either as a named list (when `x` has column names), or as a vector. If passed
|
||||
as a vector and `x` has column names, will try to match the elements by name.
|
||||
|
||||
A value of `+1` for a given feature makes the model predictions / scores constrained to be
|
||||
a monotonically increasing function of that feature (that is, as the value of the feature
|
||||
increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically
|
||||
decreasing function. A value of zero imposes no constraint.
|
||||
|
||||
The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which
|
||||
case the columns that are not referred to in `monotone_constraints` will be assumed to have
|
||||
a value of zero (no constraint imposed on the model for those features).
|
||||
|
||||
See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{
|
||||
Monotonic Constraints} for a more detailed explanation.}
|
||||
|
||||
\item{interaction_constraints}{Constraints specifying which feature interactions are permitted.
|
||||
The constraints must be specified in the form of a list of vectors referencing columns in the
|
||||
data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numbering
|
||||
starting at 1 - i.e. the first vector references the first and second columns) or
|
||||
`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references
|
||||
columns by names), where each vector is a group of indices of features that are allowed to
|
||||
interact with each other.
|
||||
|
||||
See the tutorial
|
||||
\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{
|
||||
Feature Interaction Constraints} for more information.}
|
||||
|
||||
\item{feature_weights}{Feature weights for column sampling.
|
||||
|
||||
Can be passed either as a vector with length matching the number of columns of `x`, or as a named
|
||||
list (only if `x` has column names) with names matching columns of `x`. If it is a
|
||||
named vector, will try to match the entries to column names of `x` by name.
|
||||
|
||||
If `NULL` (the default), all columns will have the same weight.}
|
||||
|
||||
\item{base_margin}{Base margin used for boosting from existing model.
|
||||
|
||||
If passing it, will start the gradient boosting procedure from the scores that are provided
|
||||
here - for example, one can pass the raw scores from a previous model, or some per-observation
|
||||
offset, or similar.
|
||||
|
||||
Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives)
|
||||
with the same number of rows as `x` and number of columns corresponding to number of optimization
|
||||
targets, and should be in the untransformed scale (for example, for objective `binary:logistic`,
|
||||
it should have log-odds, not probabilities; and for objective `multi:softprob`, should have
|
||||
number of columns matching to number of classes in the data).
|
||||
|
||||
Note that, if it contains more than one column, then columns will not be matched by name to
|
||||
the corresponding `y` - `base_margin` should have the same column order that the model will use
|
||||
(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against
|
||||
`levels(y)` by their position, regardless of what `colnames(base_margin)` returns).
|
||||
|
||||
If `NULL`, will start from zero, but note that for most objectives, an intercept is usually
|
||||
added (controllable through parameter `base_score` instead) when `base_margin` is not passed.}
|
||||
|
||||
\item{...}{Other training parameters. See the online documentation
|
||||
\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for
|
||||
details about possible values and what they do.
|
||||
|
||||
Note that not all possible values from the core XGBoost library are allowed as `params` for
|
||||
'xgboost()' - in particular, values which require an already-fitted booster object (such as
|
||||
`process_type`) are not accepted here.}
|
||||
}
|
||||
\value{
|
||||
A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular
|
||||
`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an
|
||||
additional attribute `metadata` containing information which is used for formatting prediction
|
||||
outputs, such as class names for classification problems.
|
||||
}
|
||||
\description{
|
||||
Fits an XGBoost model (boosted decision tree ensemble) to given x/y data.
|
||||
|
||||
See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{
|
||||
Introduction to Boosted Trees} for a longer explanation of what XGBoost does.
|
||||
|
||||
This function is intended to provide a more user-friendly interface for XGBoost that follows
|
||||
R's conventions for model fitting and predictions, but which doesn't expose all of the
|
||||
possible functionalities of the core XGBoost library.
|
||||
|
||||
See \link{xgb.train} for a more flexible low-level alternative which is similar across different
|
||||
language bindings of XGBoost and which exposes the full library's functionalities.
|
||||
}
|
||||
\details{
|
||||
For package authors using `xgboost` as a dependency, it is highly recommended to use
|
||||
\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface
|
||||
and performs fewer data conversions and copies along the way.
|
||||
}
|
||||
\examples{
|
||||
library(xgboost)
|
||||
data(mtcars)
|
||||
|
||||
# Fit a small regression model on the mtcars data
|
||||
model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
|
||||
predict(model_regression, mtcars, validate_features = TRUE)
|
||||
|
||||
# Task objective is determined automatically according to the type of 'y'
|
||||
data(iris)
|
||||
model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5)
|
||||
predict(model_classif, iris, validate_features = TRUE)
|
||||
}
|
||||
\references{
|
||||
\itemize{
|
||||
\item Chen, Tianqi, and Carlos Guestrin. "XGBoost: A Scalable Tree Boosting System."
|
||||
Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and
|
||||
Data Mining. 2016.
|
||||
\item \url{https://xgboost.readthedocs.io/en/stable/}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user