% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/callbacks.R
\name{xgb.cb.gblinear.history}
\alias{xgb.cb.gblinear.history}
\title{Callback for collecting coefficients history of a gblinear booster}
\usage{
xgb.cb.gblinear.history(sparse = FALSE)
}
\arguments{
\item{sparse}{When set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result.
Sparse format is useful when one expects only a subset of coefficients to be non-zero,
such as when using the "thrifty" feature selector with a fairly small number of top features
selected per iteration.}
}
\value{
An \code{xgb.Callback} object, which can be passed to \code{\link[=xgb.train]{xgb.train()}} or \code{\link[=xgb.cv]{xgb.cv()}}.
}
\description{
Callback for collecting coefficients history of a gblinear booster
}
\details{
To keep things fast and simple, the gblinear booster does not internally store the history of linear
model coefficients at each boosting iteration. This callback provides a workaround for storing
the coefficients' path, by extracting them after each training iteration.

This callback will construct a matrix where rows are boosting iterations and columns are
feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept
corresponding to the first column).

When there is more than one coefficient per feature (e.g. multi-class classification),
the result will be reshaped into a vector where coefficients are arranged first by features and
then by class (e.g. coefficients 1 through N will be for the first class, then
coefficients N+1 through 2N for the second class, and so on).

If the result has only one coefficient per feature in the data, then the resulting matrix
will have column names matching the feature names; otherwise (when there is more than
one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index'
(so e.g. column 'c1' for class '0' will be named 'c1:0').

With \code{\link[=xgb.train]{xgb.train()}}, the output is either a dense or a sparse matrix.
With \code{\link[=xgb.cv]{xgb.cv()}}, it is a list (one element per fold) of such matrices.

Function \link{xgb.gblinear.history} provides an easy way to retrieve the
outputs from this callback.
}
\examples{
#### Binary classification:

## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)

# In the iris dataset, it is hard to linearly separate the Versicolor class from the rest
# without considering the 2nd order interactions:
x <- model.matrix(Species ~ .^2, iris)[, -1]
colnames(x)
dtrain <- xgb.DMatrix(
  scale(x),
  label = 1 * (iris$Species == "versicolor"),
  nthread = nthread
)
param <- list(
  booster = "gblinear",
  objective = "reg:logistic",
  eval_metric = "auc",
  lambda = 0.0003,
  alpha = 0.0003,
  nthread = nthread
)

# For 'shotgun', which is the default linear updater, using high eta values may result in
# unstable behaviour in some datasets. With this simple dataset, however, the high learning
# rate does not break the convergence, but allows us to illustrate the typical pattern of
# "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations.
bst <- xgb.train(
  param,
  dtrain,
  list(tr = dtrain),
  nrounds = 200,
  eta = 1,
  callbacks = list(xgb.cb.gblinear.history())
)

# Extract the coefficients' path and plot them vs. the boosting iteration number:
coef_path <- xgb.gblinear.history(bst)
matplot(coef_path, type = "l")
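
# The last row of the collected history should correspond to the coefficients of the
# final model, in the same order as coef() (intercept first), as noted in Details:
tail(coef_path, 1)
coef(bst)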

# With the deterministic coordinate descent updater, it is safer to use higher learning rates.
# Will try the classical componentwise boosting, which selects a single best feature per round:
bst <- xgb.train(
  param,
  dtrain,
  list(tr = dtrain),
  nrounds = 200,
  eta = 0.8,
  updater = "coord_descent",
  feature_selector = "thrifty",
  top_k = 1,
  callbacks = list(xgb.cb.gblinear.history())
)
matplot(xgb.gblinear.history(bst), type = "l")
# Componentwise boosting is known to have a similar effect to Lasso regularization.
# Try experimenting with various values of top_k, eta, nrounds,
# as well as different feature_selectors.
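
# For instance, one could compare against the 'greedy' feature selector; the
# settings below are illustrative rather than tuned:
bst_greedy <- xgb.train(
  param,
  dtrain,
  list(tr = dtrain),
  nrounds = 200,
  eta = 0.8,
  updater = "coord_descent",
  feature_selector = "greedy",
  callbacks = list(xgb.cb.gblinear.history())
)
matplot(xgb.gblinear.history(bst_greedy), type = "l")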

# For xgb.cv:
bst <- xgb.cv(
  param,
  dtrain,
  nfold = 5,
  nrounds = 100,
  eta = 0.8,
  callbacks = list(xgb.cb.gblinear.history())
)
# Coefficients in CV fold #3:
matplot(xgb.gblinear.history(bst)[[3]], type = "l")
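
# With xgb.cv, the callback output is a list with one matrix per fold (see Details);
# a quick look at its structure:
str(xgb.gblinear.history(bst), max.level = 1)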

#### Multiclass classification:
dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread)

param <- list(
  booster = "gblinear",
  objective = "multi:softprob",
  num_class = 3,
  lambda = 0.0003,
  alpha = 0.0003,
  nthread = nthread
)

# For the default linear updater 'shotgun', it is sometimes helpful
# to use a smaller eta to reduce instability.
bst <- xgb.train(
  param,
  dtrain,
  list(tr = dtrain),
  nrounds = 50,
  eta = 0.5,
  callbacks = list(xgb.cb.gblinear.history())
)

# Will plot the coefficient paths separately for each class:
matplot(xgb.gblinear.history(bst, class_index = 0), type = "l")
matplot(xgb.gblinear.history(bst, class_index = 1), type = "l")
matplot(xgb.gblinear.history(bst, class_index = 2), type = "l")
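
# Column names follow the 'feature:class' pattern described in Details,
# so the coefficients for a single class can also be located by name:
head(colnames(xgb.gblinear.history(bst)))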

# CV:
bst <- xgb.cv(
  param,
  dtrain,
  nfold = 5,
  nrounds = 70,
  eta = 0.5,
  callbacks = list(xgb.cb.gblinear.history(sparse = FALSE))
)
# 1st fold of 1st class:
matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = "l")
}
\seealso{
\link{xgb.gblinear.history}, \link{coef.xgb.Booster}.
}