[R] R-interface for SHAP interactions (#3636)

* add R-interface for SHAP interactions
* update docs for new roxygen version

parent 10c31ab2cb
commit 5b662cbe1c
R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.80.1
+Version: 0.81.0.1
 Date: 2018-08-13
 Authors@R: c(
 person("Tianqi", "Chen", role = c("aut"),
@@ -61,5 +61,5 @@ Imports:
 data.table (>= 1.9.6),
 magrittr (>= 1.5),
 stringi (>= 0.5.2)
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 SystemRequirements: GNU make, C++11
R-package/R/xgb.Booster.R
@@ -129,11 +129,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' logistic regression would result in predictions for log-odds instead of probabilities.
 #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
 #' It will use all the trees by default (\code{NULL} value).
-#' @param predleaf whether predict leaf index instead.
-#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
+#' @param predleaf whether predict leaf index.
+#' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
+#' @param predinteraction whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
-#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
+#' prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+#' or predinteraction flags is TRUE.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
@@ -158,6 +160,11 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
 #'
+#' With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+#' are computed. Note that this operation might be rather expensive in terms of compute and memory.
+#' Since it quadratically depends on the number of features, it is recommended to perform selection
+#' of the most important features first. See below about the format of the returned results.
+#'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
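Since the cost of `predinteraction` grows quadratically with the number of features, a pre-selection pass along the lines below can keep it tractable. This is a hedged sketch, not part of the commit: `bst`, `X`, `y`, `params`, the `nrounds` value, and the top-20 cutoff are illustrative assumptions; `xgb.importance()` is the package's existing importance accessor, which returns a data.table sorted by Gain.

# Illustrative sketch: restrict interaction computation to the most important features.
# Assumes an existing booster `bst`, feature matrix `X`, labels `y`, and params `params`.
library(xgboost)
imp <- xgb.importance(model = bst)                      # features sorted by Gain
top_feats <- head(imp$Feature, 20)                      # arbitrary cutoff
dm_small <- xgb.DMatrix(X[, top_feats], label = y)
bst_small <- xgb.train(params, dm_small, nrounds = 50)  # refit on the reduced set
intr <- predict(bst_small, dm_small, predinteraction = TRUE)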
@@ -173,6 +180,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' such a matrix. The contribution values are on the scale of untransformed margin
 #' (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
 #'
+#' When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+#' dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+#' elements represent different features' interaction contributions. The array is symmetric WRT the last
+#' two dimensions. The "+ 1" column corresponds to bias. Summing this array along the last dimension should
+#' produce practically the same result as predict with \code{predcontrib = TRUE}.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such an array.
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
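As a usage sketch of the documented format (assuming a fitted booster `bst` and an `xgb.DMatrix` `dm` in a non-multiclass setting; not part of the commit):

cont <- predict(bst, dm, predcontrib = TRUE)      # matrix [nrow, num_features + 1]
intr <- predict(bst, dm, predinteraction = TRUE)  # array  [nrow, num_features + 1, num_features + 1]
# symmetric in the last two dimensions:
all.equal(intr, aperm(intr, c(1, 3, 2)))
# summing out the last dimension recovers the per-feature contributions (approximately):
max(abs(apply(intr, c(1, 2), sum) - cont))        # should be ~0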
@@ -269,7 +284,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
-                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {
+                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
+                                reshape = FALSE, ...) {
 
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -285,7 +301,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
 
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
+    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
 
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
 
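For reference, the prediction mode travels to the C++ backend as a single integer bit mask, and the new predinteraction flag occupies bit 4 (16L). A standalone illustration of the encoding (not part of the commit; `bitwAnd` is base R):

# bit 0 (1L)  outputmargin
# bit 1 (2L)  predleaf
# bit 2 (4L)  predcontrib
# bit 3 (8L)  approxcontrib
# bit 4 (16L) predinteraction
option <- 4L + 8L        # e.g. approximate contributions
bitwAnd(option, 4L) > 0  # TRUE:  predcontrib requested
bitwAnd(option, 16L) > 0 # FALSE: predinteraction not requested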
@@ -305,17 +322,28 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   } else if (predcontrib) {
     n_col1 <- ncol(newdata) + 1
     n_group <- npred_per_case / n_col1
-    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
+    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
     ret <- if (n_ret == n_row) {
-      matrix(ret, ncol = 1, dimnames = dnames)
+      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
     } else if (n_group == 1) {
-      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
     } else {
-      grp_mask <- rep(seq_len(n_col1), n_row) +
-                  rep((seq_len(n_row) - 1) * n_col1 * n_group, each = n_col1)
-      lapply(seq_len(n_group), function(g) {
-        matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
-      })
+      arr <- array(ret, c(n_col1, n_group, n_row),
+                   dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
+      lapply(seq_len(n_group), function(g) arr[g,,])
+    }
+  } else if (predinteraction) {
+    n_col1 <- ncol(newdata) + 1
+    n_group <- npred_per_case / n_col1^2
+    cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+    ret <- if (n_ret == n_row) {
+      matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_group == 1) {
+      array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
+    } else {
+      arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
+                   dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
+      lapply(seq_len(n_group), function(g) arr[g,,,])
     }
   } else if (reshape && npred_per_case > 1) {
     ret <- matrix(ret, nrow = n_row, byrow = TRUE)
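The rewritten reshaping leans on the fact that the flat vector returned by the backend varies fastest over columns, then groups, then rows, while R's `array()` fills its first dimension fastest. A standalone illustration of the equivalence with the old byrow-matrix masking (not part of the commit; all names here are made up for the demo):

library(magrittr)
n_row <- 2; n_group <- 3; n_col1 <- 4
ret <- seq_len(n_row * n_group * n_col1)    # stand-in for the flat prediction vector
arr <- array(ret, c(n_col1, n_group, n_row)) %>% aperm(c(2, 3, 1))  # [group, row, col]
arr[2, , ]                                  # n_row x n_col1 slice for group 2
# same slice via the old byrow-matrix indexing:
matrix(ret, nrow = n_row, byrow = TRUE)[, (n_col1 + 1):(2 * n_col1)]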
R-package/man/predict.xgb.Booster.Rd
@@ -7,7 +7,8 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE,
+  predinteraction = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -26,14 +27,17 @@ logistic regression would result in predictions for log-odds instead of probabil
 \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
 It will use all the trees by default (\code{NULL} value).}
 
-\item{predleaf}{whether predict leaf index instead.}
+\item{predleaf}{whether predict leaf index.}
 
-\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+\item{predcontrib}{whether to return feature contributions to individual predictions (see Details).}
 
 \item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
 
+\item{predinteraction}{whether to return contributions of feature interactions to individual predictions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
-prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
+prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+or predinteraction flags is TRUE.}
 
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
@@ -51,6 +55,14 @@ When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such a matrix. The contribution values are on the scale of untransformed margin
 (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+
+When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+elements represent different features' interaction contributions. The array is symmetric WRT the last
+two dimensions. The "+ 1" column corresponds to bias. Summing this array along the last dimension should
+produce practically the same result as predict with \code{predcontrib = TRUE}.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such an array.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
@@ -76,6 +88,11 @@ values (Lundberg 2017) that sum to the difference between the expected output
 of the model and the current prediction (where the hessian weights are used to compute the expectations).
 Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 in \url{http://blog.datadive.net/interpreting-random-forests/}.
+
+With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+are computed. Note that this operation might be rather expensive in terms of compute and memory.
+Since it quadratically depends on the number of features, it is recommended to perform selection
+of the most important features first. See below about the format of the returned results.
 }
 \examples{
 ## binary classification:
R-package/man/xgb.cv.Rd
@@ -4,11 +4,12 @@
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
-  prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
-  feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE,
-  print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL,
-  callbacks = list(), ...)
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
+  missing = NA, prediction = FALSE, showsd = TRUE,
+  metrics = list(), obj = NULL, feval = NULL, stratified = TRUE,
+  folds = NULL, verbose = TRUE, print_every_n = 1L,
+  early_stopping_rounds = NULL, maximize = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
R-package/man/xgb.dump.Rd
@@ -44,8 +44,8 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
-dump.path = file.path(tempdir(), 'model.dump')
-xgb.dump(bst, dump.path, with_stats = TRUE)
+dump_path = file.path(tempdir(), 'model.dump')
+xgb.dump(bst, dump_path, with_stats = TRUE)
 
 # print the model without saving it to a file
 print(xgb.dump(bst, with_stats = TRUE))
R-package/man/xgb.plot.deepness.Rd
@@ -5,11 +5,11 @@
 \alias{xgb.plot.deepness}
 \title{Plot model trees deepness}
 \usage{
-xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"))
+xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"))
 
-xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"), plot = TRUE, ...)
+xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"), plot = TRUE, ...)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
R-package/man/xgb.plot.importance.Rd
@@ -9,8 +9,8 @@ xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
   measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
 
 xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
-  measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
-  plot = TRUE, ...)
+  measure = NULL, rel_to_first = FALSE, left_margin = 10,
+  cex = NULL, plot = TRUE, ...)
 }
 \arguments{
 \item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
R-package/man/xgb.plot.shap.Rd
@@ -6,8 +6,8 @@
 \usage{
 xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
   model = NULL, trees = NULL, target_class = NULL,
-  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
-  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0,
+  0, 1, 0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
   ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
   pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
   span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
R-package/man/xgb.train.Rd
@@ -5,15 +5,17 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
-xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-  feval = NULL, verbose = 1, print_every_n = 1L,
+xgb.train(params = list(), data, nrounds, watchlist = list(),
+  obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 
 xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
   params = list(), nrounds, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters.
R-package/tests/testthat/test_interactions.R (new file, 108 lines)
@@ -0,0 +1,108 @@
+context('Test prediction of feature interactions')
+
+require(xgboost)
+require(magrittr)
+
+set.seed(123)
+
+test_that("predict feature interactions works", {
+  # simulate some binary data and a linear outcome with an interaction term
+  N <- 1000
+  P <- 5
+  X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
+  # center the data (as contributions are computed WRT feature means)
+  X <- scale(X, scale=FALSE)
+
+  # outcome without any interactions, without any noise:
+  f <- function(x) 2 * x[, 1] - 3 * x[, 2]
+  # outcome with interactions, without noise:
+  f_int <- function(x) f(x) + 2 * x[, 2] * x[, 3]
+  # outcome with interactions, with noise:
+  #f_int_noise <- function(x) f_int(x) + rnorm(N, 0, 0.3)
+
+  y <- f_int(X)
+
+  dm <- xgb.DMatrix(X, label = y)
+  param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
+  b <- xgb.train(param, dm, 100)
+
+  pred = predict(b, dm, outputmargin=TRUE)
+
+  # SHAP contributions:
+  cont <- predict(b, dm, predcontrib=TRUE)
+  expect_equal(dim(cont), c(N, P+1))
+  # make sure for each row they add up to marginal predictions
+  max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)
+  # Hand-construct the 'ground truth' feature contributions:
+  gt_cont <- cbind(
+      2. * X[, 1],
+     -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
+      1. * X[, 2] * X[, 3]                # and another HALF of the interaction term to feature #3
+  )
+  gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
+  # These should be relatively close:
+  expect_lt(max(abs(cont - gt_cont)), 0.05)
+
+
+  # SHAP interaction contributions:
+  intr <- predict(b, dm, predinteraction=TRUE)
+  expect_equal(dim(intr), c(N, P+1, P+1))
+  # check assigned colnames
+  cn <- c(letters[1:P], "BIAS")
+  expect_equal(dimnames(intr), list(NULL, cn, cn))
+
+  # check the symmetry
+  max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)
+
+  # sums WRT columns must be close to feature contributions
+  max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)
+
+  # diagonal terms for features 3,4,5 must be close to zero
+  Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)
+
+  # BIAS must have no interactions
+  max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)
+
+  # interactions other than 2 x 3 must be close to zero
+  intr23 <- intr
+  intr23[,2,3] <- 0
+  Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)
+
+  # Construct the 'ground truth' contributions of interactions directly from the linear terms:
+  gt_intr <- array(0, c(N, P+1, P+1))
+  gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
+  gt_intr[,3,2] <- gt_intr[, 2, 3]
+  # merge-in the diagonal based on 'ground truth' feature contributions
+  intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
+  for(j in seq_len(P)) {
+    gt_intr[,j,j] = intr_diag[,j]
+  }
+  # These should be relatively close:
+  expect_lt(max(abs(intr - gt_intr)), 0.1)
+})
+
+
+test_that("multiclass feature interactions work", {
+  dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
+  param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
+  b <- xgb.train(param, dm, 40)
+  pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t
+
+  # SHAP contributions:
+  cont <- predict(b, dm, predcontrib=TRUE)
+  expect_length(cont, 3)
+  # rewrap them as a 3d array
+  cont <- unlist(cont) %>% array(c(150, 5, 3))
+  # make sure for each row they add up to marginal predictions
+  max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)
+
+  # SHAP interaction contributions:
+  intr <- predict(b, dm, predinteraction=TRUE)
+  expect_length(intr, 3)
+  # rewrap them as a 4d array
+  intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
+  # check the symmetry
+  max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
+  # sums WRT columns must be close to feature contributions
+  max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
+})
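To run just the new test file locally, a sketch (assumes xgboost and testthat are installed and the working directory is the repository root):

library(testthat)
library(xgboost)
test_file('R-package/tests/testthat/test_interactions.R')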