[R] R-interface for SHAP interactions (#3636)
* add R-interface for SHAP interactions
* update docs for new roxygen version
parent 10c31ab2cb
commit 5b662cbe1c
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.80.1
+Version: 0.81.0.1
 Date: 2018-08-13
 Authors@R: c(
     person("Tianqi", "Chen", role = c("aut"),
@@ -61,5 +61,5 @@ Imports:
     data.table (>= 1.9.6),
     magrittr (>= 1.5),
     stringi (>= 0.5.2)
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 SystemRequirements: GNU make, C++11
@@ -129,11 +129,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' logistic regression would result in predictions for log-odds instead of probabilities.
 #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
 #' It will use all the trees by default (\code{NULL} value).
-#' @param predleaf whether predict leaf index instead.
-#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
+#' @param predleaf whether predict leaf index.
+#' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
+#' @param predinteraction whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
-#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
+#' prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+#' or predinteraction flags is TRUE.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
@@ -158,6 +160,11 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
+#'
+#' With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+#' are computed. Note that this operation might be rather expensive in terms of compute and memory.
+#' Since it quadratically depends on the number of features, it is recommended to perform selection
+#' of the most important features first. See below about the format of the returned results.
 #'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
@@ -173,6 +180,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' such a matrix. The contribution values are on the scale of untransformed margin
 #' (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+#'
+#' When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+#' dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+#' elements represent different feature interaction contributions. The array is symmetric WRT the last
+#' two dimensions. The "+ 1" column corresponds to the bias. Summing this array along the last dimension should
+#' produce practically the same result as predict with \code{predcontrib = TRUE}.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such an array.
 #'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
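To make the documented shapes concrete, here is a rough, self-contained usage sketch of the new interface (an illustration, not part of the patch; it uses the agaricus data bundled with xgboost and scores only a few rows, since the interaction output grows quadratically with the number of features):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 2, eta = 1, nrounds = 5, nthread = 2,
               objective = "binary:logistic", verbose = 0)

newd <- agaricus.train$data[1:50, ]                    # keep the interaction array small
contrib <- predict(bst, newd, predcontrib = TRUE)      # matrix  [nrow, num_features + 1]
inter   <- predict(bst, newd, predinteraction = TRUE)  # array   [nrow, num_features + 1, num_features + 1]

dim(contrib); dim(inter)
max(abs(inter - aperm(inter, c(1, 3, 2))))       # symmetric in the last two dimensions
max(abs(apply(inter, c(1, 2), sum) - contrib))   # summing out the last dimension ~ predcontrib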
@@ -269,7 +284,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
-                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {
+                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
+                                reshape = FALSE, ...) {
 
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -285,7 +301,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
 
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
+    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
 
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
 
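For reference, the option value handed to XGBoosterPredict_R is a plain bit field; the sketch below (a hypothetical helper named prediction_option, not part of the patch) restates how the flags combine, with predinteraction occupying the new 16 bit:

# bits: outputmargin = 1, predleaf = 2, predcontrib = 4, approxcontrib = 8, predinteraction = 16
prediction_option <- function(outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE,
                              approxcontrib = FALSE, predinteraction = FALSE) {
  0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) +
    4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib) +
    16L * as.logical(predinteraction)
}
prediction_option(predinteraction = TRUE)                                          # 16
prediction_option(outputmargin = TRUE, predinteraction = TRUE)                     # 17
bitwAnd(prediction_option(outputmargin = TRUE, predinteraction = TRUE), 16L) > 0   # TRUE: the bit is set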
@@ -305,17 +322,28 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
 } else if (predcontrib) {
   n_col1 <- ncol(newdata) + 1
   n_group <- npred_per_case / n_col1
-  dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
   ret <- if (n_ret == n_row) {
-    matrix(ret, ncol = 1, dimnames = dnames)
+    matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
   } else if (n_group == 1) {
-    matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+    matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
   } else {
-    grp_mask <- rep(seq_len(n_col1), n_row) +
-      rep((seq_len(n_row) - 1) * n_col1 * n_group, each = n_col1)
-    lapply(seq_len(n_group), function(g) {
-      matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
-    })
+    arr <- array(ret, c(n_col1, n_group, n_row),
+                 dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
+    lapply(seq_len(n_group), function(g) arr[g,,])
   }
+} else if (predinteraction) {
+  n_col1 <- ncol(newdata) + 1
+  n_group <- npred_per_case / n_col1^2
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  ret <- if (n_ret == n_row) {
+    matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
+  } else if (n_group == 1) {
+    array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
+  } else {
+    arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
+                 dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
+    lapply(seq_len(n_group), function(g) arr[g,,,])
+  }
 } else if (reshape && npred_per_case > 1) {
   ret <- matrix(ret, nrow = n_row, byrow = TRUE)
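The replacement of the index-mask bookkeeping with array()/aperm() relies on R's column-major filling: in the raw prediction vector the per-feature values vary fastest, then the class (group), then the row. A tiny toy sketch (made-up numbers, not part of the patch) of the reordering used for the multiclass predcontrib case:

n_row <- 2; n_group <- 3; n_col1 <- 4   # e.g. 3 classes, 3 features + BIAS
# encode each value as 100*row + 10*group + column so the raw layout is visible
ret <- as.vector(sapply(seq_len(n_row), function(r)
  sapply(seq_len(n_group), function(g) 100 * r + 10 * g + seq_len(n_col1))))

arr <- aperm(array(ret, c(n_col1, n_group, n_row)), c(2, 3, 1))  # -> [group, row, col]
arr[2, , ]
#      [,1] [,2] [,3] [,4]
# [1,]  121  122  123  124    # class 2, row 1, columns 1..4
# [2,]  221  222  223  224    # class 2, row 2, columns 1..4
lapply(seq_len(n_group), function(g) arr[g, , ])  # one contribution matrix per class, as in the patch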
@@ -7,7 +7,8 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE,
+  predinteraction = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -26,14 +27,17 @@ logistic regression would result in predictions for log-odds instead of probabil
 \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
 It will use all the trees by default (\code{NULL} value).}
 
-\item{predleaf}{whether predict leaf index instead.}
+\item{predleaf}{whether predict leaf index.}
 
-\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+\item{predcontrib}{whether to return feature contributions to individual predictions (see Details).}
 
 \item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
 
+\item{predinteraction}{whether to return contributions of feature interactions to individual predictions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
-prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
+prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+or predinteraction flags is TRUE.}
 
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
@@ -51,6 +55,14 @@ When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such a matrix. The contribution values are on the scale of untransformed margin
 (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+
+When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+elements represent different feature interaction contributions. The array is symmetric WRT the last
+two dimensions. The "+ 1" column corresponds to the bias. Summing this array along the last dimension should
+produce practically the same result as predict with \code{predcontrib = TRUE}.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such an array.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
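As a rough illustration of the multiclass output format described above (a sketch, not part of the patch; it mirrors the new test below and uses the built-in iris data):

library(xgboost)
dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
bst <- xgb.train(list(eta = 0.1, max_depth = 4, objective = "multi:softprob",
                      num_class = 3), dm, nrounds = 10)

contrib <- predict(bst, dm, predcontrib = TRUE)
length(contrib); dim(contrib[[1]])   # 3 matrices, each 150 x 5 (4 features + BIAS)

inter <- predict(bst, dm, predinteraction = TRUE)
length(inter); dim(inter[[1]])       # 3 arrays, each 150 x 5 x 5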
@@ -76,6 +88,11 @@ values (Lundberg 2017) that sum to the difference between the expected output
 of the model and the current prediction (where the hessian weights are used to compute the expectations).
 Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 in \url{http://blog.datadive.net/interpreting-random-forests/}.
+
+With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+are computed. Note that this operation might be rather expensive in terms of compute and memory.
+Since it quadratically depends on the number of features, it is recommended to perform selection
+of the most important features first. See below about the format of the returned results.
 }
 \examples{
 ## binary classification:
@@ -4,11 +4,12 @@
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
-  prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
-  feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE,
-  print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL,
-  callbacks = list(), ...)
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
+  missing = NA, prediction = FALSE, showsd = TRUE,
+  metrics = list(), obj = NULL, feval = NULL, stratified = TRUE,
+  folds = NULL, verbose = TRUE, print_every_n = 1L,
+  early_stopping_rounds = NULL, maximize = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
@@ -44,8 +44,8 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
-dump.path = file.path(tempdir(), 'model.dump')
-xgb.dump(bst, dump.path, with_stats = TRUE)
+dump_path = file.path(tempdir(), 'model.dump')
+xgb.dump(bst, dump_path, with_stats = TRUE)
 
 # print the model without saving it to a file
 print(xgb.dump(bst, with_stats = TRUE))
@@ -5,11 +5,11 @@
 \alias{xgb.plot.deepness}
 \title{Plot model trees deepness}
 \usage{
-xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"))
+xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"))
 
-xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"), plot = TRUE, ...)
+xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"), plot = TRUE, ...)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
@@ -9,8 +9,8 @@ xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
   measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
 
 xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
-  measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
-  plot = TRUE, ...)
+  measure = NULL, rel_to_first = FALSE, left_margin = 10,
+  cex = NULL, plot = TRUE, ...)
 }
 \arguments{
 \item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
@@ -6,8 +6,8 @@
 \usage{
 xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
   model = NULL, trees = NULL, target_class = NULL,
-  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
-  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0,
+  0, 1, 0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
   ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
   pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
   span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
@@ -5,15 +5,17 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
-xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-  feval = NULL, verbose = 1, print_every_n = 1L,
+xgb.train(params = list(), data, nrounds, watchlist = list(),
+  obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 
 xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
   params = list(), nrounds, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters.
R-package/tests/testthat/test_interactions.R (new file, 108 lines)
@@ -0,0 +1,108 @@
context('Test prediction of feature interactions')

require(xgboost)
require(magrittr)

set.seed(123)

test_that("predict feature interactions works", {
  # simulate some binary data and a linear outcome with an interaction term
  N <- 1000
  P <- 5
  X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
  # center the data (as contributions are computed WRT feature means)
  X <- scale(X, scale=FALSE)

  # outcome without any interactions, without any noise:
  f <- function(x) 2 * x[, 1] - 3 * x[, 2]
  # outcome with interactions, without noise:
  f_int <- function(x) f(x) + 2 * x[, 2] * x[, 3]
  # outcome with interactions, with noise:
  #f_int_noise <- function(x) f_int(x) + rnorm(N, 0, 0.3)

  y <- f_int(X)

  dm <- xgb.DMatrix(X, label = y)
  param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
  b <- xgb.train(param, dm, 100)

  pred = predict(b, dm, outputmargin=TRUE)

  # SHAP contributions:
  cont <- predict(b, dm, predcontrib=TRUE)
  expect_equal(dim(cont), c(N, P+1))
  # make sure for each row they add up to marginal predictions
  max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)
  # Hand-construct the 'ground truth' feature contributions:
  gt_cont <- cbind(
    2. * X[, 1],
    -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
    1. * X[, 2] * X[, 3]                 # and another HALF of the interaction term to feature #3
  )
  gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
  # These should be relatively close:
  expect_lt(max(abs(cont - gt_cont)), 0.05)


  # SHAP interaction contributions:
  intr <- predict(b, dm, predinteraction=TRUE)
  expect_equal(dim(intr), c(N, P+1, P+1))
  # check assigned colnames
  cn <- c(letters[1:P], "BIAS")
  expect_equal(dimnames(intr), list(NULL, cn, cn))

  # check the symmetry
  max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)

  # sums WRT columns must be close to feature contributions
  max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)

  # diagonal terms for features 3,4,5 must be close to zero
  Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)

  # BIAS must have no interactions
  max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)

  # interactions other than 2 x 3 must be close to zero
  intr23 <- intr
  intr23[,2,3] <- 0
  Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)

  # Construct the 'ground truth' contributions of interactions directly from the linear terms:
  gt_intr <- array(0, c(N, P+1, P+1))
  gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
  gt_intr[,3,2] <- gt_intr[, 2, 3]
  # merge-in the diagonal based on 'ground truth' feature contributions
  intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
  for(j in seq_len(P)) {
    gt_intr[,j,j] = intr_diag[,j]
  }
  # These should be relatively close:
  expect_lt(max(abs(intr - gt_intr)), 0.1)
})
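The "attribute a HALF of the interaction term" bookkeeping above is just the Shapley value of a bilinear term. For the centered features used in this test (so E[x_2] = E[x_3] = 0 and the features are independent), a short sketch of the reasoning:

For $f(x) = 2x_1 - 3x_2 + 2x_2x_3$, the SHAP value of feature 2 averages its marginal contribution over the two orders in which $x_2$ and $x_3$ can be revealed:
\[
\phi_2 \;=\; -3x_2 \;+\; \tfrac{1}{2}\Big[\big(2x_2\,\mathbb{E}[x_3] - 2\,\mathbb{E}[x_2]\,\mathbb{E}[x_3]\big)
  \;+\; \big(2x_2x_3 - 2\,\mathbb{E}[x_2]\,x_3\big)\Big] \;=\; -3x_2 + x_2x_3 ,
\]
and symmetrically $\phi_3 = x_2x_3$: each of the two features receives half of the $2x_2x_3$ interaction. This is exactly how gt_cont is built, and gt_intr splits the same quantity between the symmetric entries $[2,3]$ and $[3,2]$ of the interaction array, while features 4 and 5 (and all their interactions) stay at zero.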
test_that("multiclass feature interactions work", {
  dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
  param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
  b <- xgb.train(param, dm, 40)
  pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t

  # SHAP contributions:
  cont <- predict(b, dm, predcontrib=TRUE)
  expect_length(cont, 3)
  # rewrap them as a 3d array
  cont <- unlist(cont) %>% array(c(150, 5, 3))
  # make sure for each row they add up to marginal predictions
  max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)

  # SHAP interaction contributions:
  intr <- predict(b, dm, predinteraction=TRUE)
  expect_length(intr, 3)
  # rewrap them as a 4d array
  intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
  # check the symmetry
  max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
  # sums WRT columns must be close to feature contributions
  max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
})