[R] maintenance Nov 2017; SHAP plots (#2888)

* [R] fix predict contributions for data with no colnames

* [R] add a render parameter for xgb.plot.multi.trees; fixes #2628

* [R] update Rd's

* [R] remove unnecessary dep-package from R cmake install

* silence type warnings; readability

* [R] silence complaint about incomplete line at the end

* [R] initial version of xgb.plot.shap()

* [R] more work on xgb.plot.shap

* [R] enforce black font in xgb.plot.tree; fixes #2640

* [R] if feature names are available, check in predict that they are the same; fixes #2857

* [R] cran check and lint fixes

* remove tabs

* [R] add references; a test for plot.shap
Authored by Vadim Khotilovich on 2017-12-05 11:45:34 -06:00; committed by Tong He
Parent: 1b77903eeb
Commit: e8a6597957
19 changed files with 554 additions and 118 deletions

@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.6.4.7
-Date: 2017-09-25
+Version: 0.6.4.8
+Date: 2017-12-05
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
     Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,
     Yuan Tang <terrytangyuan@gmail.com>

@@ -40,6 +40,7 @@ export(xgb.model.dt.tree)
 export(xgb.plot.deepness)
 export(xgb.plot.importance)
 export(xgb.plot.multi.trees)
+export(xgb.plot.shap)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
@@ -60,9 +61,12 @@ importFrom(data.table,rbindlist)
 importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,setnames)
+importFrom(grDevices,rgb)
 importFrom(graphics,barplot)
 importFrom(graphics,grid)
+importFrom(graphics,lines)
 importFrom(graphics,par)
+importFrom(graphics,points)
 importFrom(graphics,title)
 importFrom(magrittr,"%>%")
 importFrom(stats,median)

@@ -150,7 +150,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
 #' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
 #' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
-#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output
+#' values (Lundberg 2017) that sum to the difference between the expected output
 #' of the model and the current prediction (where the hessian weights are used to compute the expectations).
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
@@ -173,6 +173,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
 #' @examples
 #' ## binary classification:
 #'
@@ -265,6 +271,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
     newdata <- xgb.DMatrix(newdata, missing = missing)
+  if (!is.null(object[["feature_names"]]) &&
+      !is.null(colnames(newdata)) &&
+      !identical(object[["feature_names"]], colnames(newdata)))
+    stop("Feature names stored in `object` and `newdata` are different!")
   if (is.null(ntreelimit))
     ntreelimit <- NVL(object$best_ntreelimit, 0)
   if (NVL(object$params[['booster']], '') == 'gblinear')
@@ -292,7 +302,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   } else if (predcontrib) {
     n_col1 <- ncol(newdata) + 1
     n_group <- npred_per_case / n_col1
-    dnames <- list(NULL, c(colnames(newdata), "BIAS"))
+    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
     ret <- if (n_ret == n_row) {
       matrix(ret, ncol = 1, dimnames = dnames)
     } else if (n_group == 1) {

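A minimal sketch (not part of the diff) of the behavior the two predict() hunks above add -- the feature-name consistency check and SHAP contributions for data without column names. It assumes a small booster trained on the bundled agaricus data:

library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = "binary:logistic", verbose = 0)

# predcontrib now also works when newdata has no column names:
# the contribution matrix simply comes back without dimnames.
X <- agaricus.train$data
colnames(X) <- NULL
contr <- predict(bst, X, predcontrib = TRUE)

# and mismatched feature names now fail loudly instead of predicting silently:
X2 <- agaricus.train$data
colnames(X2) <- rev(colnames(X2))  # deliberately wrong order
try(predict(bst, X2))  # "Feature names stored in `object` and `newdata` are different!"
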
@@ -136,4 +136,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))
+globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature", "Class"))

@@ -7,10 +7,9 @@
 #' @param features_keep number of features to keep in each position of the multi trees.
 #' @param plot_width width in pixels of the graph to produce
 #' @param plot_height height in pixels of the graph to produce
+#' @param render a logical flag for whether the graph should be rendered (see Value).
 #' @param ... currently not used
 #'
-#' @return Two graphs showing the distribution of the model deepness.
-#'
 #' @details
 #'
 #' This function tries to capture the complexity of a gradient boosted tree model
@@ -30,19 +29,39 @@
 #' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 #'
+#' @return
+#'
+#' When \code{render = TRUE}:
+#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+#'
+#' When \code{render = FALSE}:
+#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+#' This could be useful if one wants to modify some of the graph attributes
+#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
+#'
 #' @examples
+#'
 #' data(agaricus.train, package='xgboost')
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
 #'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                min_child_weight = 50)
+#'                min_child_weight = 50, verbose = 0)
 #'
-#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-#'                           features_keep = 3)
+#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 #' print(p)
 #'
+#' \dontrun{
+#' # Below is an example of how to save this plot to a file.
+#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+#' library(DiagrammeR)
+#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+#' export_graph(gr, 'tree.pdf', width=1500, height=600)
+#' }
+#'
 #' @export
-xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, ...){
+xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
+                                 render = TRUE, ...){
   check.deprecation(...)
   tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
@@ -67,7 +86,6 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
   tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
   tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]
   remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
   tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position),
@@ -121,6 +139,8 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
                                    attr = c("color", "arrowsize", "arrowhead", "fontname"),
                                    value = c("DimGray", "1.5", "vee", "Helvetica"))
+
+  if (!render) return(invisible(graph))
   DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
 }

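A hedged illustration (not part of the diff) of why the new render = FALSE escape hatch is useful: the returned dgr_graph can be adjusted with DiagrammeR before rendering. It assumes a trained booster `bst` as in the roxygen example above, and that DiagrammeR's add_global_graph_attrs()/render_graph() behave as in DiagrammeR >= 0.9:

library(DiagrammeR)
gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
# tweak a Graphviz attribute before rendering, e.g. lay the tree out top-to-bottom
gr <- add_global_graph_attrs(gr, attr = "rankdir", value = "TB", attr_type = "graph")
render_graph(gr, width = 800, height = 600)
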
R-package/R/xgb.plot.shap.R (new file, 214 lines)

@@ -0,0 +1,214 @@
#' SHAP contribution dependency plots
#'
#' Visualizing the SHAP feature contribution to prediction dependencies on feature value.
#'
#' @param data data as a \code{matrix} or \code{dgCMatrix}.
#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above
#' \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.
#' @param features a vector of either column indices or of feature names to plot. When it is NULL,
#' feature importance is calculated, and \code{top_n} high ranked features are taken.
#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.
#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
#' or \code{features} is missing.
#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}.
#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index,
#' only SHAP contributions for that specific class are used.
#' If it is not set, SHAP importances are averaged over all classes.
#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.
#' @param subsample a random fraction of data points to use for plotting. When it is NULL,
#' it is set so that up to 100K data points are used.
#' @param n_col a number of columns in a grid of plots.
#' @param col color of the scatterplot markers.
#' @param pch scatterplot marker.
#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete.
#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions.
#' @param ylab a y-axis label in 1D plots.
#' @param plot_NA whether the contributions of cases with missing values should also be plotted.
#' @param col_NA a color of marker for missing value contributions.
#' @param pch_NA a marker type for NA values.
#' @param pos_NA a relative position of the x-location where NA values are shown:
#' \code{min(x) + (max(x) - min(x)) * pos_NA}.
#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with
#' more than 5 distinct values.
#' @param col_loess a color to use for the loess curves.
#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call.
#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.
#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned.
#' @param ... other parameters passed to \code{plot}.
#'
#' @details
#'
#' These scatterplots represent how SHAP feature contributions depend on feature values.
#' The similarity to partial dependency plots is that they also give an idea of how feature values
#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies
#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated
#' contributions of a feature to model prediction for each individual case.
#'
#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
#' weighted LOESS is computed and plotted, where weights are the numbers of data points
#' at each rounded value.
#'
#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
#' the margin is prediction before a sigmoidal transform into probability-like values.
#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
#' contributions for all features + bias), depending on the objective used, transforming SHAP
#' contributions for a feature from the marginal to the prediction space is not necessarily
#' a meaningful thing to do.
#'
#' @return
#'
#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
#' \itemize{
#' \item \code{data} the values of selected features;
#' \item \code{shap_contrib} the contributions of selected features.
#' }
#'
#' @references
#'
#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
#'
#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
#' eta = 0.1, max_depth = 3, subsample = .5,
#' method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
#'
#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
#'
#' # multiclass example - plots for each class separately:
#' nclass <- 3
#' nrounds <- 20
#' x <- as.matrix(iris[, -5])
#' set.seed(123)
#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
#' max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
#' objective = "multi:softprob", num_class = nclass, verbose = 0)
#' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
#' col <- rgb(0, 0, 1, 0.5)
#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#'
#' @rdname xgb.plot.shap
#' @export
xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
trees = NULL, target_class = NULL, approxcontrib = FALSE,
subsample = NULL, n_col = 1, col = rgb(0, 0, 1, 0.2), pch = '.',
discrete_n_uniq = 5, discrete_jitter = 0.01, ylab = "SHAP",
plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
which = c("1d", "2d"), plot = TRUE, ...) {
if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
stop("data: must be either matrix or dgCMatrix")
if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
if (!is.null(shap_contrib) &&
(!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
stop("shap_contrib is not compatible with the provided data")
nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
idx <- sample(1:nrow(data), nsample)
data <- data[idx,]
if (is.null(shap_contrib)) {
shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
} else {
shap_contrib <- shap_contrib[idx,]
}
which <- match.arg(which)
if (which == "2d")
stop("2D plots are not implemented yet")
if (is.null(features)) {
imp <- xgb.importance(model = model, trees = trees)
top_n <- as.integer(top_n[1])
if (top_n < 1 && top_n > 100)
stop("top_n: must be an integer within [1, 100]")
features <- imp$Feature[1:min(top_n, NROW(imp))]
}
if (is.character(features)) {
if (is.null(colnames(data)))
stop("Either provide `data` with column names or provide `features` as column indices")
features <- match(features, colnames(data))
}
if (n_col > length(features)) n_col <- length(features)
if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
else Reduce("+", lapply(shap_contrib, abs))
}
shap_contrib <- shap_contrib[, features, drop = FALSE]
data <- data[, features, drop = FALSE]
cols <- colnames(data)
if (is.null(cols)) cols <- colnames(shap_contrib)
if (is.null(cols)) cols <- paste0('X', 1:ncol(data))
colnames(data) <- cols
colnames(shap_contrib) <- cols
if (plot && which == "1d") {
op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
oma = c(0,0,0,0) + 0.2,
mar = c(3.5,3.5,0,0) + 0.1,
mgp = c(1.7, 0.6, 0))
for (f in cols) {
ord <- order(data[, f])
x <- data[, f][ord]
y <- shap_contrib[, f][ord]
x_lim <- range(x, na.rm = TRUE)
y_lim <- range(y, na.rm = TRUE)
do_na <- plot_NA && any(is.na(x))
if (do_na) {
x_range <- diff(x_lim)
loc_na <- min(x, na.rm = TRUE) + x_range * pos_NA
x_lim <- range(c(x_lim, loc_na))
}
x_uniq <- unique(x)
x2plot <- x
# add small jitter for discrete features with <= 5 distinct values
if (length(x_uniq) <= discrete_n_uniq)
x2plot <- jitter(x, amount = discrete_jitter * min(diff(x_uniq), na.rm = TRUE))
plot(x2plot, y, pch = pch, xlab = f, col = col, xlim = x_lim, ylim = y_lim, ylab = ylab, ...)
grid()
if (plot_loess) {
# compress x to 3 digits, and mean-aggregate y
zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
if (nrow(zz) <= 5) {
lines(zz$x, zz$y, col = col_loess)
} else {
lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = span_loess)
zz$y_lo <- predict(lo, zz, type = "link")
lines(zz$x, zz$y_lo, col = col_loess)
}
}
if (do_na) {
i_na <- which(is.na(x))
x_na <- rep(loc_na, length(i_na))
x_na <- jitter(x_na, amount = x_range * 0.01)
points(x_na, y[i_na], pch = pch_NA, col = col_NA)
}
}
par(op)
}
if (plot && which == "2d") {
# TODO
}
invisible(list(data = data, shap_contrib = shap_contrib))
}
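A small usage sketch (not part of the commit): with plot = FALSE the function also works as a data extractor for custom plotting. It assumes `bst` trained on the agaricus data and agaricus.test loaded, as in the roxygen examples above:

res <- xgb.plot.shap(agaricus.test$data, model = bst, top_n = 3, plot = FALSE)
str(res$data)          # feature values of the 3 most important features
str(res$shap_contrib)  # matching per-row SHAP contributions
# e.g. roll your own scatterplot for the first selected feature:
f <- colnames(res$data)[1]
plot(res$data[, f], res$shap_contrib[, f], xlab = f, ylab = "SHAP", pch = ".")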

@@ -95,7 +95,8 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
     label = dt$label,
     fillcolor = dt$filledcolor,
     shape = dt$shape,
-    data = dt$Feature)
+    data = dt$Feature,
+    fontcolor = "black")
   edges <- DiagrammeR::create_edge_df(
     from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID),

@@ -169,6 +169,11 @@
 #' \code{\link{predict.xgb.Booster}},
 #' \code{\link{xgb.cv}}
 #'
+#' @references
+#'
+#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')

@@ -100,9 +100,12 @@ NULL
 #' @importFrom stats median
 #' @importFrom utils head
 #' @importFrom graphics barplot
+#' @importFrom graphics lines
+#' @importFrom graphics points
 #' @importFrom graphics grid
 #' @importFrom graphics par
 #' @importFrom graphics title
+#' @importFrom grDevices rgb
 #'
 #' @import methods
 #' @useDynLib xgboost, .registration = TRUE

@@ -11,3 +11,4 @@ early_stopping Early Stop in training
 poisson_regression Poisson Regression on count data
 tweedie_regression Tweddie Regression
 gpu_accelerated GPU-accelerated tree building algorithms

@@ -7,7 +7,7 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)

 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -30,6 +30,8 @@ It will use all the trees by default (\code{NULL} value).}
 \item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}

+\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
 prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
@@ -69,10 +71,11 @@ e.g., as implemented in \code{\link{xgb.create.features}}.
 Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
 individual predictions. For "gblinear" booster, feature contributions are simply linear terms
-(feature_beta * feature_value). For "gbtree" booster, feature contribution is calculated
-as a sum of average contribution of that feature's split nodes across all trees to an
-individual prediction, following the idea explained in
-\url{http://blog.datadive.net/interpreting-random-forests/}.
+(feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
+values (Lundberg 2017) that sum to the difference between the expected output
+of the model and the current prediction (where the hessian weights are used to compute the expectations).
+Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
+in \url{http://blog.datadive.net/interpreting-random-forests/}.
 }
 \examples{
 ## binary classification:
@@ -98,7 +101,7 @@ str(pred_leaf)
 # the result is an nsamples X (nfeatures + 1) matrix
 pred_contr <- predict(bst, test$data, predcontrib = TRUE)
 str(pred_contr)
-# verify that contributions' sums are equal to log-odds of predictions (up to foat precision):
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
 summary(rowSums(pred_contr) - qlogis(pred))
 # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
 contr1 <- pred_contr[1,]
@@ -158,6 +161,11 @@ err <- sapply(1:25, function(n) {
 })
 plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
 }
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+}
 \seealso{
 \code{\link{xgb.train}}.

@@ -5,7 +5,7 @@
 \title{Project all trees on one tree and plot it}
 \usage{
 xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,
-  plot_width = NULL, plot_height = NULL, ...)
+  plot_width = NULL, plot_height = NULL, render = TRUE, ...)
 }
 \arguments{
 \item{model}{produced by the \code{xgb.train} function.}
@@ -18,10 +18,19 @@ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,
 \item{plot_height}{height in pixels of the graph to produce}

+\item{render}{a logical flag for whether the graph should be rendered (see Value).}
+
 \item{...}{currently not used}
 }
 \value{
-Two graphs showing the distribution of the model deepness.
+When \code{render = TRUE}:
+returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+
+When \code{render = FALSE}:
+silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+This could be useful if one wants to modify some of the graph attributes
+before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
 }
 \description{
 Visualization of the ensemble of trees as a single collective unit.
@@ -45,14 +54,22 @@ This function is inspired by this blog post:
 \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 }
 \examples{

 data(agaricus.train, package='xgboost')

 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-               min_child_weight = 50)
+               min_child_weight = 50, verbose = 0)

-p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-                          features_keep = 3)
+p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 print(p)

+\dontrun{
+# Below is an example of how to save this plot to a file.
+# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+library(DiagrammeR)
+gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+export_graph(gr, 'tree.pdf', width=1500, height=600)
+}
 }

@@ -0,0 +1,135 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.plot.shap}
\alias{xgb.plot.shap}
\title{SHAP contribution dependency plots}
\usage{
xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
model = NULL, trees = NULL, target_class = NULL,
approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}
\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}
\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}
\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
\item{n_col}{a number of columns in a grid of plots.}
\item{col}{color of the scatterplot markers.}
\item{pch}{scatterplot marker.}
\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.}
\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.}
\item{ylab}{a y-axis label in 1D plots.}
\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.}
\item{col_NA}{a color of marker for missing value contributions.}
\item{pch_NA}{a marker type for NA values.}
\item{pos_NA}{a relative position of the x-location where NA values are shown:
\code{min(x) + (max(x) - min(x)) * pos_NA}.}
\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with
more than 5 distinct values.}
\item{col_loess}{a color to use for the loess curves.}
\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.}
\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.}
\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.}
\item{...}{other parameters passed to \code{plot}.}
}
\value{
In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
\itemize{
\item \code{data} the values of selected features;
\item \code{shap_contrib} the contributions of selected features.
}
}
\description{
Visualizing the SHAP feature contribution to prediction dependencies on feature value.
}
\details{
These scatterplots represent how SHAP feature contributions depend on feature values.
The similarity to partial dependency plots is that they also give an idea of how feature values
affect predictions. However, in partial dependency plots, we usually see marginal dependencies
of model prediction on feature value, while SHAP contribution dependency plots display the estimated
contributions of a feature to model prediction for each individual case.
When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
weighted LOESS is computed and plotted, where weights are the numbers of data points
at each rounded value.
Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
the margin is prediction before a sigmoidal transform into probability-like values.
Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
contributions for all features + bias), depending on the objective used, transforming SHAP
contributions for a feature from the marginal to the prediction space is not necessarily
a meaningful thing to do.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
eta = 0.1, max_depth = 3, subsample = .5,
method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
# multiclass example - plots for each class separately:
nclass <- 3
nrounds <- 20
x <- as.matrix(iris[, -5])
set.seed(123)
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
objective = "multi:softprob", num_class = nclass, verbose = 0)
trees0 <- seq(from=0, by=nclass, length.out=nrounds)
col <- rgb(0, 0, 1, 0.5)
xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
}
\references{
Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
}

@@ -258,6 +258,10 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                objective = "binary:logistic")
 pred <- predict(bst, agaricus.test$data)
 }
+\references{
+Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+}
 \seealso{
 \code{\link{callbacks}},

@@ -81,6 +81,11 @@ test_that("predict feature contributions works", {
   expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
   pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
   expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+  # must work with data that has no column names
+  X <- sparse_matrix
+  colnames(X) <- NULL
+  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
+  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE)

   # gbtree binary classifier (approximate method)
   expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
@@ -289,6 +294,13 @@ test_that("xgb.plot.deepness works", {
   xgb.ggplot.deepness(model = bst.Tree)
 })

+test_that("xgb.plot.shap works", {
+  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
+  expect_equal(names(sh), c("data", "shap_contrib"))
+  expect_equal(NCOL(sh$data), 2)
+  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
 test_that("check.deprecation works", {
   ttt <- function(a = NNULL, DUMMY=NULL, ...) {
     check.deprecation(...)

@@ -78,7 +78,7 @@ function(setup_rpackage_install_target rlib_target build_dir)
   install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars\" \"all:\")")
   install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars.win\" \"all:\")")
   set(XGB_DEPS_SCRIPT
-    "deps = setdiff(c('statar','data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
+    "deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
     if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')")
   install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" \"-q\" \"-e\" \"${XGB_DEPS_SCRIPT}\")")
   install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" CMD INSTALL\

@@ -651,7 +651,7 @@ inline void ExtendPath(PathElement *unique_path, unsigned unique_depth,
   unique_path[unique_depth].feature_index = feature_index;
   unique_path[unique_depth].zero_fraction = zero_fraction;
   unique_path[unique_depth].one_fraction = one_fraction;
-  unique_path[unique_depth].pweight = static_cast<bst_float>(unique_depth == 0 ? 1 : 0);
+  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1.0f : 0.0f);
   for (int i = unique_depth - 1; i >= 0; i--) {
     unique_path[i+1].pweight += one_fraction * unique_path[i].pweight * (i + 1)
                                 / static_cast<bst_float>(unique_depth + 1);
@@ -718,7 +718,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   // extend the unique path
   PathElement *unique_path = parent_unique_path + unique_depth;
-  if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path);
+  if (unique_depth > 0) std::copy(parent_unique_path,
+                                  parent_unique_path + unique_depth, unique_path);
   ExtendPath(unique_path, unique_depth, parent_zero_fraction,
              parent_one_fraction, parent_feature_index);
   const unsigned split_index = node.split_index();
@@ -742,7 +743,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   } else {
     hot_index = node.cright();
   }
-  const unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft());
+  const unsigned cold_index = (static_cast<int>(hot_index) == node.cleft() ?
+                               node.cright() : node.cleft());
   const bst_float w = this->stat(node_index).sum_hess;
   const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess / w;
   const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess / w;
@@ -753,7 +755,7 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   // if so we undo that split so we can redo it for this node
   unsigned path_index = 0;
   for (; path_index <= unique_depth; ++path_index) {
-    if (unique_path[path_index].feature_index == split_index) break;
+    if (static_cast<unsigned>(unique_path[path_index].feature_index) == split_index) break;
   }
   if (path_index != unique_depth + 1) {
     incoming_zero_fraction = unique_path[path_index].zero_fraction;
@@ -773,8 +775,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
 inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
                                             bst_float *out_contribs) const {
   // find the expected value of the tree's predictions
-  bst_float base_value = 0.0;
-  bst_float total_cover = 0;
+  bst_float base_value = 0.0f;
+  bst_float total_cover = 0.0f;
   for (int i = 0; i < (*this).param.num_nodes; ++i) {
     const auto node = (*this)[i];
     if (node.is_leaf()) {
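
For orientation (not part of the diff): the quantity the TreeShap recursion above attributes to each feature i is the Shapley value from the Lundberg and Lee papers referenced in the documentation changes. A sketch of the definition in standard notation, with N the set of all features and f_x(S) the expected model output given only the features in S (estimated here with the hessian-weighted cover, as the predict() docs note):

\phi_i = \sum_{S \subseteq N \setminus \{i\}}
         \frac{|S|!\,(|N| - |S| - 1)!}{|N|!}
         \left[ f_x(S \cup \{i\}) - f_x(S) \right]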