% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.plot.shap}
\alias{xgb.plot.shap}
\title{SHAP dependence plots}
\usage{
xgb.plot.shap(
  data,
  shap_contrib = NULL,
  features = NULL,
  top_n = 1,
  model = NULL,
  trees = NULL,
  target_class = NULL,
  approxcontrib = FALSE,
  subsample = NULL,
  n_col = 1,
  col = rgb(0, 0, 1, 0.2),
  pch = ".",
  discrete_n_uniq = 5,
  discrete_jitter = 0.01,
  ylab = "SHAP",
  plot_NA = TRUE,
  col_NA = rgb(0.7, 0, 1, 0.6),
  pch_NA = ".",
  pos_NA = 1.07,
  plot_loess = TRUE,
  col_loess = 2,
  span_loess = 0.5,
  which = c("1d", "2d"),
  plot = TRUE,
  ...
)
}
\arguments{
\item{data}{The data to explain as a \code{matrix} or \code{dgCMatrix}.}

\item{shap_contrib}{Matrix of SHAP contributions of \code{data}.
The default (\code{NULL}) computes it from \code{model} and \code{data}.}

\item{features}{Vector of column indices or feature names to plot.
When \code{NULL} (default), the \code{top_n} most important features are selected
by \code{\link[=xgb.importance]{xgb.importance()}}.}

\item{top_n}{How many of the most important features (<= 100) should be selected?
By default 1 for SHAP dependence and 10 for SHAP summary.
Only used when \code{features = NULL}.}

\item{model}{An \code{xgb.Booster} model. Only required when \code{shap_contrib = NULL} or
\code{features = NULL}.}

\item{trees}{Passed to \code{\link[=xgb.importance]{xgb.importance()}} when \code{features = NULL}.}

\item{target_class}{Only relevant for multiclass models. The default (\code{NULL})
averages the SHAP values over all classes. Pass a (0-based) class index
to show only SHAP values of that class.}

\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.}

\item{subsample}{Fraction of data points randomly picked for plotting.
The default (\code{NULL}) will use up to 100k data points.}

\item{n_col}{Number of columns in a grid of plots.}

\item{col}{Color of the scatterplot markers.}

\item{pch}{Scatterplot marker.}

\item{discrete_n_uniq}{Maximal number of unique feature values to consider the
feature as discrete.}

\item{discrete_jitter}{Jitter amount added to the values of discrete features.}

\item{ylab}{The y-axis label in 1D plots.}

\item{plot_NA}{Should contributions of cases with missing values be plotted?
Default is \code{TRUE}.}

\item{col_NA}{Color of marker for missing value contributions.}

\item{pch_NA}{Marker type for \code{NA} values.}

\item{pos_NA}{Relative position of the x-location where \code{NA} values are shown:
\code{min(x) + (max(x) - min(x)) * pos_NA}.}

\item{plot_loess}{Should loess-smoothed curves be plotted? (Default is \code{TRUE}).
The smoothing is only done for features with more than 5 distinct values.}

\item{col_loess}{Color of loess curves.}

\item{span_loess}{The \code{span} parameter of \code{\link[stats:loess]{stats::loess()}}.}

\item{which}{Whether to do univariate or bivariate plotting. Currently, only "1d" is implemented.}

\item{plot}{Should the plot be drawn? (Default is \code{TRUE}).
If \code{FALSE}, only a list of matrices is returned.}

\item{...}{Other parameters passed to \code{\link[graphics:plot.default]{graphics::plot()}}.}
}
\value{
In addition to producing plots (when \code{plot = TRUE}), it silently returns a list of two matrices:
\itemize{
\item \code{data}: Feature value matrix.
\item \code{shap_contrib}: Corresponding SHAP value matrix.
}
}
\description{
Visualizes SHAP values against feature values to gain an impression of feature effects.
}
\details{
These scatterplots represent how SHAP feature contributions depend on feature values.
The similarity to partial dependence plots is that they also give an idea for how feature values
affect predictions. However, in partial dependence plots, we see marginal dependencies
of model prediction on feature value, while SHAP dependence plots display the estimated
contributions of a feature to the prediction for each individual case.

When \code{plot_loess = TRUE}, feature values are rounded to three significant digits and
weighted LOESS is computed and plotted, where the weights are the numbers of data points
at each rounded value.

Note: SHAP contributions are on the scale of the model margin.
E.g., for a logistic binomial objective, the margin is on log-odds scale.
Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
contributions for all features + bias), depending on the objective used, transforming SHAP
contributions for a feature from the marginal to the prediction space is not necessarily
a meaningful thing to do.
}
\examples{

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

## Keep the number of threads to 1 for examples
nthread <- 1
data.table::setDTthreads(nthread)
nrounds <- 20

bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, agaricus.train$label),
  nrounds = nrounds,
  eta = 0.1,
  max_depth = 3,
  subsample = 0.5,
  objective = "binary:logistic",
  nthread = nthread,
  verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")

contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)

# Summary plot
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)

# Multiclass example - plots for each class separately:
nclass <- 3
x <- as.matrix(iris[, -5])
set.seed(123)
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values

mbst <- xgb.train(
  data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1),
  nrounds = nrounds,
  max_depth = 2,
  eta = 0.3,
  subsample = 0.5,
  nthread = nthread,
  objective = "multi:softprob",
  num_class = nclass,
  verbose = 0
)
trees0 <- seq(from = 0, by = nclass, length.out = nrounds)
col <- rgb(0, 0, 1, 0.5)
xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0,
  target_class = 0,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0 + 1,
  target_class = 1,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

xgb.plot.shap(
  x,
  model = mbst,
  trees = trees0 + 2,
  target_class = 2,
  top_n = 4,
  n_col = 2,
  col = col,
  pch = 16,
  pch_NA = 17
)

# Summary plot
xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4)

}
\references{
\enumerate{
\item Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions",
NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
\item Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles",
\url{https://arxiv.org/abs/1706.06060}
}
}