[R] R-interface for SHAP interactions (#3636)

* add R-interface for SHAP interactions

* update docs for new roxygen version
Vadim Khotilovich 2018-08-30 19:06:21 -05:00 committed by GitHub
parent 10c31ab2cb
commit 5b662cbe1c
10 changed files with 194 additions and 38 deletions

DESCRIPTION

@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.80.1
+Version: 0.81.0.1
 Date: 2018-08-13
 Authors@R: c(
     person("Tianqi", "Chen", role = c("aut"),
@@ -61,5 +61,5 @@ Imports:
     data.table (>= 1.9.6),
     magrittr (>= 1.5),
     stringi (>= 0.5.2)
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 SystemRequirements: GNU make, C++11

R/xgb.Booster.R

@@ -129,11 +129,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' logistic regression would result in predictions for log-odds instead of probabilities.
 #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
 #' It will use all the trees by default (\code{NULL} value).
-#' @param predleaf whether predict leaf index instead.
-#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
+#' @param predleaf whether to predict leaf index.
+#' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
+#' @param predinteraction whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
-#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
+#' prediction outputs per case. This option has no effect when any of the predleaf, predcontrib,
+#' or predinteraction flags is TRUE.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
@@ -158,6 +160,11 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
 #'
+#' With \code{predinteraction = TRUE}, SHAP contributions of interactions for each pair of features
+#' are computed. Note that this operation might be rather expensive in terms of compute and memory.
+#' Since its cost grows quadratically with the number of features, it is recommended to first select
+#' the most important features. See below for the format of the returned results.
+#'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
@@ -173,6 +180,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' such a matrix. The contribution values are on the scale of untransformed margin
 #' (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
 #'
+#' When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+#' dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal elements (in the last
+#' two dimensions) represent the contributions of pairwise feature interactions. The array is symmetric
+#' with respect to the last two dimensions. The "+ 1" column corresponds to the bias term. Summing this
+#' array along the last dimension should produce practically the same result as predict with
+#' \code{predcontrib = TRUE}. For a multiclass case, a list of \code{num_class} elements is returned,
+#' where each element is such an array.
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
@@ -269,7 +284,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
-                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {
+                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
+                                reshape = FALSE, ...) {
 
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -285,7 +301,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
 
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
+    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
 
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
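
An aside for orientation (illustration only, not part of the diff): the option value above is a bit mask, so each prediction mode contributes a distinct power of two and any combination of flags maps to one integer that is passed through .Call to the native predictor. A minimal sketch:

flags <- c(outputmargin = 1L, predleaf = 2L, predcontrib = 4L,
           approxcontrib = 8L, predinteraction = 16L)
# e.g. SHAP interaction values on the untransformed margin scale:
sum(flags[c("outputmargin", "predinteraction")])  # 17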
@@ -305,17 +322,28 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
   } else if (predcontrib) {
     n_col1 <- ncol(newdata) + 1
     n_group <- npred_per_case / n_col1
-    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
+    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
     ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = dnames)
+      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
     } else if (n_group == 1) {
-      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
     } else {
-      grp_mask <- rep(seq_len(n_col1), n_row) +
-                  rep((seq_len(n_row) - 1) * n_col1 * n_group, each = n_col1)
-      lapply(seq_len(n_group), function(g) {
-        matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
-      })
+      arr <- array(ret, c(n_col1, n_group, n_row),
+                   dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
+      lapply(seq_len(n_group), function(g) arr[g,,])
+    }
+  } else if (predinteraction) {
+    n_col1 <- ncol(newdata) + 1
+    n_group <- npred_per_case / n_col1^2
+    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+    ret <- if (n_ret == n_row) {
+      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_group == 1) {
+      array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
+    } else {
+      arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
+                   dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
+      lapply(seq_len(n_group), function(g) arr[g,,,])
     }
   } else if (reshape && npred_per_case > 1) {
     ret <- matrix(ret, nrow = n_row, byrow = TRUE)
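
For context, a usage sketch of the new flag on a small built-in dataset (not part of this diff; exact numbers depend on the fitted model):

library(xgboost)
# small regression example: predict mpg from the remaining mtcars columns
x <- as.matrix(mtcars[, -1])
bst <- xgboost(data = x, label = mtcars$mpg, max_depth = 3, eta = 0.3,
               nrounds = 30, verbose = 0)
# 3d array of SHAP interaction values: [row, feature + bias, feature + bias]
intr <- predict(bst, x, predinteraction = TRUE)
dim(intr)                                    # 32 x 11 x 11
# summing out the last dimension recovers the per-feature SHAP contributions
cont <- predict(bst, x, predcontrib = TRUE)
max(abs(apply(intr, c(1, 2), sum) - cont))   # practically zero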

man/predict.xgb.Booster.Rd

@@ -7,7 +7,8 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE,
+  predinteraction = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -26,14 +27,17 @@ logistic regression would result in predictions for log-odds instead of probabilities.
 \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
 It will use all the trees by default (\code{NULL} value).}
 
-\item{predleaf}{whether predict leaf index instead.}
+\item{predleaf}{whether to predict leaf index.}
 
-\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+\item{predcontrib}{whether to return feature contributions to individual predictions (see Details).}
 
 \item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
 
+\item{predinteraction}{whether to return contributions of feature interactions to individual predictions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
-prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
+prediction outputs per case. This option has no effect when any of the predleaf, predcontrib,
+or predinteraction flags is TRUE.}
 
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
@@ -51,6 +55,14 @@ When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such a matrix. The contribution values are on the scale of untransformed margin
 (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+
+When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal elements (in the last
+two dimensions) represent the contributions of pairwise feature interactions. The array is symmetric
+with respect to the last two dimensions. The "+ 1" column corresponds to the bias term. Summing this
+array along the last dimension should produce practically the same result as predict with
+\code{predcontrib = TRUE}. For a multiclass case, a list of \code{num_class} elements is returned,
+where each element is such an array.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
@@ -76,6 +88,11 @@ values (Lundberg 2017) that sum to the difference between the expected output
 of the model and the current prediction (where the hessian weights are used to compute the expectations).
 Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 in \url{http://blog.datadive.net/interpreting-random-forests/}.
+
+With \code{predinteraction = TRUE}, SHAP contributions of interactions for each pair of features
+are computed. Note that this operation might be rather expensive in terms of compute and memory.
+Since its cost grows quadratically with the number of features, it is recommended to first select
+the most important features. See below for the format of the returned results.
 }
 \examples{
 ## binary classification:

man/xgb.cv.Rd

@@ -4,11 +4,12 @@
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
-  prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
-  feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE,
-  print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL,
-  callbacks = list(), ...)
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
+  missing = NA, prediction = FALSE, showsd = TRUE,
+  metrics = list(), obj = NULL, feval = NULL, stratified = TRUE,
+  folds = NULL, verbose = TRUE, print_every_n = 1L,
+  early_stopping_rounds = NULL, maximize = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:

man/xgb.dump.Rd

@@ -44,8 +44,8 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
-dump.path = file.path(tempdir(), 'model.dump')
-xgb.dump(bst, dump.path, with_stats = TRUE)
+dump_path = file.path(tempdir(), 'model.dump')
+xgb.dump(bst, dump_path, with_stats = TRUE)
 # print the model without saving it to a file
 print(xgb.dump(bst, with_stats = TRUE))

man/xgb.plot.deepness.Rd

@@ -5,11 +5,11 @@
 \alias{xgb.plot.deepness}
 \title{Plot model trees deepness}
 \usage{
-xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"))
+xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"))
 
-xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"), plot = TRUE, ...)
+xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"), plot = TRUE, ...)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function

man/xgb.plot.importance.Rd

@@ -9,8 +9,8 @@ xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
   measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
 
 xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
-  measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
-  plot = TRUE, ...)
+  measure = NULL, rel_to_first = FALSE, left_margin = 10,
+  cex = NULL, plot = TRUE, ...)
 }
 \arguments{
 \item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}

man/xgb.plot.shap.Rd

@@ -6,8 +6,8 @@
 \usage{
 xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
   model = NULL, trees = NULL, target_class = NULL,
-  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
-  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0,
+  0, 1, 0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
   ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
   pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
   span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)

man/xgb.train.Rd

@@ -5,15 +5,17 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
-xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-  feval = NULL, verbose = 1, print_every_n = 1L,
+xgb.train(params = list(), data, nrounds, watchlist = list(),
+  obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 
 xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
   params = list(), nrounds, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters.

tests/testthat (new test file for feature interaction predictions)

@@ -0,0 +1,108 @@
context('Test prediction of feature interactions')

require(xgboost)
require(magrittr)

set.seed(123)

test_that("predict feature interactions works", {
  # simulate some binary data and a linear outcome with an interaction term
  N <- 1000
  P <- 5
  X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
  # center the data (as contributions are computed WRT feature means)
  X <- scale(X, scale=FALSE)

  # outcome without any interactions, without any noise:
  f <- function(x) 2 * x[, 1] - 3 * x[, 2]
  # outcome with interactions, without noise:
  f_int <- function(x) f(x) + 2 * x[, 2] * x[, 3]
  # outcome with interactions, with noise:
  #f_int_noise <- function(x) f_int(x) + rnorm(N, 0, 0.3)
  y <- f_int(X)

  dm <- xgb.DMatrix(X, label = y)
  param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
  b <- xgb.train(param, dm, 100)
  pred = predict(b, dm, outputmargin=TRUE)

  # SHAP contributions:
  cont <- predict(b, dm, predcontrib=TRUE)
  expect_equal(dim(cont), c(N, P+1))
  # make sure for each row they add up to marginal predictions
  max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)

  # Hand-construct the 'ground truth' feature contributions:
  gt_cont <- cbind(
     2. * X[, 1],
    -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
     1. * X[, 2] * X[, 3]                # and another HALF of the interaction term to feature #3
  )
  gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
  # These should be relatively close:
  expect_lt(max(abs(cont - gt_cont)), 0.05)

  # SHAP interaction contributions:
  intr <- predict(b, dm, predinteraction=TRUE)
  expect_equal(dim(intr), c(N, P+1, P+1))
  # check assigned colnames
  cn <- c(letters[1:P], "BIAS")
  expect_equal(dimnames(intr), list(NULL, cn, cn))

  # check the symmetry
  max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)
  # sums WRT columns must be close to feature contributions
  max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)
  # diagonal terms for features 3,4,5 must be close to zero
  Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)
  # BIAS must have no interactions
  max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)
  # interactions other than 2 x 3 must be close to zero
  intr23 <- intr
  intr23[,2,3] <- 0
  Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)

  # Construct the 'ground truth' contributions of interactions directly from the linear terms:
  gt_intr <- array(0, c(N, P+1, P+1))
  gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
  gt_intr[,3,2] <- gt_intr[, 2, 3]
  # merge-in the diagonal based on 'ground truth' feature contributions
  intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
  for(j in seq_len(P)) {
    gt_intr[,j,j] = intr_diag[,j]
  }
  # These should be relatively close:
  expect_lt(max(abs(intr - gt_intr)), 0.1)
})
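
# --- Aside (illustration only, not part of the committed test file) -----------
# Why the test splits the interaction term evenly between features 2 and 3:
# for a pure product term c*x2*x3 with centered features, a feature missing
# from a coalition is replaced by its (zero) mean, so averaging the marginal
# contributions of x2 over the two possible orderings yields exactly half of
# the term. This is what gt_cont and gt_intr encode above.
c_ <- 2; x2 <- 1; x3 <- -1                    # one hypothetical observation
v <- function(S) c_ * (if (2 %in% S) x2 else 0) * (if (3 %in% S) x3 else 0)
phi2 <- mean(c(v(2) - v(integer(0)),          # x2 enters the coalition first
               v(c(2, 3)) - v(3)))            # x2 enters after x3
phi2 == c_ * x2 * x3 / 2                      # TRUE: half of the interaction term
# -------------------------------------------------------------------------------
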
test_that("multiclass feature interactions work", {
dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
b <- xgb.train(param, dm, 40)
pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t
# SHAP contributions:
cont <- predict(b, dm, predcontrib=TRUE)
expect_length(cont, 3)
# rewrap them as a 3d array
cont <- unlist(cont) %>% array(c(150, 5, 3))
# make sure for each row they add up to marginal predictions
max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)
# SHAP interaction contributions:
intr <- predict(b, dm, predinteraction=TRUE)
expect_length(intr, 3)
# rewrap them as a 4d array
intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
# check the symmetry
max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
# sums WRT columns must be close to feature contributions
max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
})
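
A possible follow-up on the returned array (a sketch, not part of this change): averaging absolute interaction values over rows gives a quick ranking of the strongest feature pairs, e.g. for the single-class intr computed in the first test above.

# mean absolute interaction strength per feature pair, ignoring main effects
mean_intr <- apply(abs(intr), c(2, 3), mean)
diag(mean_intr) <- 0
top <- which(mean_intr == max(mean_intr), arr.ind = TRUE)[1, ]
rownames(mean_intr)[top[1]]; colnames(mean_intr)[top[2]]  # the simulated pair "b"/"c"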