115 lines
3.7 KiB
R
115 lines
3.7 KiB
R
% Generated by roxygen2: do not edit by hand
|
|
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.importance.R
|
|
\name{xgb.ggplot.importance}
|
|
\alias{xgb.ggplot.importance}
|
|
\alias{xgb.plot.importance}
|
|
\title{Plot feature importance}
|
|
\usage{
|
|
xgb.ggplot.importance(
|
|
importance_matrix = NULL,
|
|
top_n = NULL,
|
|
measure = NULL,
|
|
rel_to_first = FALSE,
|
|
n_clusters = seq_len(10),
|
|
...
|
|
)
|
|
|
|
xgb.plot.importance(
|
|
importance_matrix = NULL,
|
|
top_n = NULL,
|
|
measure = NULL,
|
|
rel_to_first = FALSE,
|
|
left_margin = 10,
|
|
cex = NULL,
|
|
plot = TRUE,
|
|
...
|
|
)
|
|
}
|
|
\arguments{
|
|
\item{importance_matrix}{A \code{data.table} as returned by \code{\link[=xgb.importance]{xgb.importance()}}.}
|
|
|
|
\item{top_n}{Maximum number of top features to include in the plot.}
|
|
|
|
\item{measure}{The name of importance measure to plot.
|
|
When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.}
|
|
|
|
\item{rel_to_first}{Whether importance values should be represented as relative to
|
|
the highest ranked feature, see Details.}
|
|
|
|
\item{n_clusters}{A numeric vector giving the range (minimum to maximum)
|
|
of the possible number of clusters of bars.}
|
|
|
|
\item{...}{Other parameters passed to \code{\link[graphics:barplot]{graphics::barplot()}}
|
|
(except \code{horiz}, \code{border}, \code{cex.names}, \code{names.arg}, and \code{las}).
|
|
Only used in \code{xgb.plot.importance()}.}
|
|
|
|
\item{left_margin}{Adjust the left margin size to fit feature names.
|
|
When \code{NULL}, the existing \code{par("mar")} is used.}
|
|
|
|
\item{cex}{Passed as \code{cex.names} parameter to \code{\link[graphics:barplot]{graphics::barplot()}}.}
|
|
|
|
\item{plot}{Should the barplot be shown? Default is \code{TRUE}.}
|
|
}
|
|
\value{
|
|
The return value depends on the function:
|
|
\itemize{
|
|
\item \code{xgb.plot.importance()}: Invisibly, a "data.table" with \code{top_n} features sorted
|
|
by importance. If \code{plot = TRUE}, the values are also plotted as barplot.
|
|
\item \code{xgb.ggplot.importance()}: A customizable "ggplot" object.
|
|
E.g., to change the title, set \code{+ ggtitle("A GRAPH NAME")}.
|
|
}
|
|
}
|
|
\description{
|
|
Represents previously calculated feature importance as a bar graph.
|
|
\itemize{
|
|
\item \code{xgb.plot.importance()} uses base R graphics, while
|
|
\item \code{xgb.ggplot.importance()} uses "ggplot".
|
|
}
|
|
}
|
|
\details{
|
|
The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
|
|
Features are sorted by decreasing importance.
|
|
It works for both "gblinear" and "gbtree" models.
|
|
|
|
When \code{rel_to_first = FALSE}, the values would be plotted as in \code{importance_matrix}.
|
|
For a "gbtree" model, that would mean being normalized to the total of 1
|
|
("what is feature's importance contribution relative to the whole model?").
|
|
For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
|
|
Setting \code{rel_to_first = TRUE} shows the picture from the perspective of
|
|
"what is feature's importance contribution relative to the most important feature?"
|
|
|
|
The "ggplot" backend performs 1-D clustering of the importance values,
|
|
with bar colors corresponding to different clusters having similar importance values.
|
|
}
|
|
\examples{
|
|
data(agaricus.train)
|
|
|
|
## Keep the number of threads to 2 for examples
|
|
nthread <- 2
|
|
data.table::setDTthreads(nthread)
|
|
|
|
bst <- xgb.train(
|
|
data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
|
|
max_depth = 3,
|
|
eta = 1,
|
|
nthread = nthread,
|
|
nrounds = 2,
|
|
objective = "binary:logistic"
|
|
)
|
|
|
|
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
|
|
xgb.plot.importance(
|
|
importance_matrix, rel_to_first = TRUE, xlab = "Relative importance"
|
|
)
|
|
|
|
gg <- xgb.ggplot.importance(
|
|
importance_matrix, measure = "Frequency", rel_to_first = TRUE
|
|
)
|
|
gg
|
|
gg + ggplot2::ylab("Frequency")
|
|
|
|
}
|
|
\seealso{
|
|
\code{\link[graphics:barplot]{graphics::barplot()}}
|
|
}
|