[R] R-interface for SHAP interactions (#3636)
* add R-interface for SHAP interactions
* update docs for new roxygen version
parent 10c31ab2cb
commit 5b662cbe1c
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.80.1
+Version: 0.81.0.1
 Date: 2018-08-13
 Authors@R: c(
     person("Tianqi", "Chen", role = c("aut"),
@@ -61,5 +61,5 @@ Imports:
     data.table (>= 1.9.6),
     magrittr (>= 1.5),
     stringi (>= 0.5.2)
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 SystemRequirements: GNU make, C++11
@@ -129,11 +129,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' logistic regression would result in predictions for log-odds instead of probabilities.
 #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
 #' It will use all the trees by default (\code{NULL} value).
-#' @param predleaf whether predict leaf index instead.
-#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
+#' @param predleaf whether predict leaf index.
+#' @param predcontrib whether to return feature contributions to individual predictions (see Details).
 #' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
+#' @param predinteraction whether to return contributions of feature interactions to individual predictions (see Details).
 #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
-#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
+#' prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+#' or predinteraction flags is TRUE.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
@@ -158,6 +160,11 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
+#'
+#' With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+#' are computed. Note that this operation might be rather expensive in terms of compute and memory.
+#' Since it quadratically depends on the number of features, it is recommended to perform selection
+#' of the most important features first. See below about the format of the returned results.
 #'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
 #' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
@@ -173,6 +180,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' such a matrix. The contribution values are on the scale of untransformed margin
 #' (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+#'
+#' When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+#' dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+#' elements represent different feature interaction contributions. The array is symmetric WRT the last
+#' two dimensions. The "+ 1" column corresponds to the bias. Summing this array along the last dimension should
+#' produce practically the same result as predict with \code{predcontrib = TRUE}.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such an array.
 #'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
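To make the documented shapes concrete, here is a rough, self-contained usage sketch of the new interface (an illustration, not part of the patch; it uses the agaricus data bundled with xgboost and scores only a few rows, since the interaction output grows quadratically with the number of features):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 2, eta = 1, nrounds = 5, nthread = 2,
               objective = "binary:logistic", verbose = 0)

newd <- agaricus.train$data[1:50, ]                    # keep the interaction array small
contrib <- predict(bst, newd, predcontrib = TRUE)      # matrix  [nrow, num_features + 1]
inter   <- predict(bst, newd, predinteraction = TRUE)  # array   [nrow, num_features + 1, num_features + 1]

dim(contrib); dim(inter)
max(abs(inter - aperm(inter, c(1, 3, 2))))       # symmetric in the last two dimensions
max(abs(apply(inter, c(1, 2), sum) - contrib))   # summing out the last dimension ~ predcontrib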
@@ -269,7 +284,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @rdname predict.xgb.Booster
 #' @export
 predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
-                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {
+                                predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
+                                reshape = FALSE, ...) {
 
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -285,7 +301,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
 
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
+    8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
 
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
 
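For reference, the option value handed to XGBoosterPredict_R is a plain bit field; the sketch below (a hypothetical helper named prediction_option, not part of the patch) restates how the flags combine, with predinteraction occupying the new 16 bit:

# bits: outputmargin = 1, predleaf = 2, predcontrib = 4, approxcontrib = 8, predinteraction = 16
prediction_option <- function(outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE,
                              approxcontrib = FALSE, predinteraction = FALSE) {
  0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) +
    4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib) +
    16L * as.logical(predinteraction)
}
prediction_option(predinteraction = TRUE)                                          # 16
prediction_option(outputmargin = TRUE, predinteraction = TRUE)                     # 17
bitwAnd(prediction_option(outputmargin = TRUE, predinteraction = TRUE), 16L) > 0   # TRUE: the bit is set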
@@ -305,17 +322,28 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
 } else if (predcontrib) {
   n_col1 <- ncol(newdata) + 1
   n_group <- npred_per_case / n_col1
-  dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
   ret <- if (n_ret == n_row) {
-    matrix(ret, ncol = 1, dimnames = dnames)
+    matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
   } else if (n_group == 1) {
-    matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+    matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
   } else {
-    grp_mask <- rep(seq_len(n_col1), n_row) +
-      rep((seq_len(n_row) - 1) * n_col1 * n_group, each = n_col1)
-    lapply(seq_len(n_group), function(g) {
-      matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
-    })
+    arr <- array(ret, c(n_col1, n_group, n_row),
+                 dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col]
+    lapply(seq_len(n_group), function(g) arr[g,,])
   }
+} else if (predinteraction) {
+  n_col1 <- ncol(newdata) + 1
+  n_group <- npred_per_case / n_col1^2
+  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  ret <- if (n_ret == n_row) {
+    matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
+  } else if (n_group == 1) {
+    array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2))
+  } else {
+    arr <- array(ret, c(n_col1, n_col1, n_group, n_row),
+                 dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2]
+    lapply(seq_len(n_group), function(g) arr[g,,,])
+  }
 } else if (reshape && npred_per_case > 1) {
   ret <- matrix(ret, nrow = n_row, byrow = TRUE)
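The replacement of the index-mask bookkeeping with array()/aperm() relies on R's column-major filling: in the raw prediction vector the per-feature values vary fastest, then the class (group), then the row. A tiny toy sketch (made-up numbers, not part of the patch) of the reordering used for the multiclass predcontrib case:

n_row <- 2; n_group <- 3; n_col1 <- 4   # e.g. 3 classes, 3 features + BIAS
# encode each value as 100*row + 10*group + column so the raw layout is visible
ret <- as.vector(sapply(seq_len(n_row), function(r)
  sapply(seq_len(n_group), function(g) 100 * r + 10 * g + seq_len(n_col1))))

arr <- aperm(array(ret, c(n_col1, n_group, n_row)), c(2, 3, 1))  # -> [group, row, col]
arr[2, , ]
#      [,1] [,2] [,3] [,4]
# [1,]  121  122  123  124    # class 2, row 1, columns 1..4
# [2,]  221  222  223  224    # class 2, row 2, columns 1..4
lapply(seq_len(n_group), function(g) arr[g, , ])  # one contribution matrix per class, as in the patch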
@@ -7,7 +7,8 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE,
+  predinteraction = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -26,14 +27,17 @@ logistic regression would result in predictions for log-odds instead of probabil
 \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
 It will use all the trees by default (\code{NULL} value).}
 
-\item{predleaf}{whether predict leaf index instead.}
+\item{predleaf}{whether predict leaf index.}
 
-\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+\item{predcontrib}{whether to return feature contributions to individual predictions (see Details).}
 
 \item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
 
+\item{predinteraction}{whether to return contributions of feature interactions to individual predictions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
-prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
+prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
+or predinteraction flags is TRUE.}
 
 \item{...}{Parameters passed to \code{predict.xgb.Booster}}
 }
@@ -51,6 +55,14 @@ When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is
 For a multiclass case, a list of \code{num_class} elements is returned, where each element is
 such a matrix. The contribution values are on the scale of untransformed margin
 (e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
+
+When \code{predinteraction = TRUE} and it is not a multiclass setting, the output is a 3d array with
+dimensions \code{c(nrow, num_features + 1, num_features + 1)}. The off-diagonal (in the last two dimensions)
+elements represent different feature interaction contributions. The array is symmetric WRT the last
+two dimensions. The "+ 1" column corresponds to the bias. Summing this array along the last dimension should
+produce practically the same result as predict with \code{predcontrib = TRUE}.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such an array.
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
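As a rough illustration of the multiclass output format described above (a sketch, not part of the patch; it mirrors the new test below and uses the built-in iris data):

library(xgboost)
dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
bst <- xgb.train(list(eta = 0.1, max_depth = 4, objective = "multi:softprob",
                      num_class = 3), dm, nrounds = 10)

contrib <- predict(bst, dm, predcontrib = TRUE)
length(contrib); dim(contrib[[1]])   # 3 matrices, each 150 x 5 (4 features + BIAS)

inter <- predict(bst, dm, predinteraction = TRUE)
length(inter); dim(inter[[1]])       # 3 arrays, each 150 x 5 x 5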
@@ -76,6 +88,11 @@ values (Lundberg 2017) that sum to the difference between the expected output
 of the model and the current prediction (where the hessian weights are used to compute the expectations).
 Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 in \url{http://blog.datadive.net/interpreting-random-forests/}.
+
+With \code{predinteraction = TRUE}, SHAP values of contributions of interaction of each pair of features
+are computed. Note that this operation might be rather expensive in terms of compute and memory.
+Since it quadratically depends on the number of features, it is recommended to perform selection
+of the most important features first. See below about the format of the returned results.
 }
 \examples{
 ## binary classification:
@@ -4,11 +4,12 @@
 \alias{xgb.cv}
 \title{Cross Validation}
 \usage{
-xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
-  prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
-  feval = NULL, stratified = TRUE, folds = NULL, verbose = TRUE,
-  print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL,
-  callbacks = list(), ...)
+xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
+  missing = NA, prediction = FALSE, showsd = TRUE,
+  metrics = list(), obj = NULL, feval = NULL, stratified = TRUE,
+  folds = NULL, verbose = TRUE, print_every_n = 1L,
+  early_stopping_rounds = NULL, maximize = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters. Commonly used ones are:
@@ -44,8 +44,8 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 # save the model in file 'xgb.model.dump'
-dump.path = file.path(tempdir(), 'model.dump')
-xgb.dump(bst, dump.path, with_stats = TRUE)
+dump_path = file.path(tempdir(), 'model.dump')
+xgb.dump(bst, dump_path, with_stats = TRUE)
 
 # print the model without saving it to a file
 print(xgb.dump(bst, with_stats = TRUE))
@@ -5,11 +5,11 @@
 \alias{xgb.plot.deepness}
 \title{Plot model trees deepness}
 \usage{
-xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"))
+xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"))
 
-xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
-  "med.weight"), plot = TRUE, ...)
+xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth",
+  "med.depth", "med.weight"), plot = TRUE, ...)
 }
 \arguments{
 \item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
@@ -9,8 +9,8 @@ xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
   measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
 
 xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
-  measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
-  plot = TRUE, ...)
+  measure = NULL, rel_to_first = FALSE, left_margin = 10,
+  cex = NULL, plot = TRUE, ...)
 }
 \arguments{
 \item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
@@ -6,8 +6,8 @@
 \usage{
 xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
   model = NULL, trees = NULL, target_class = NULL,
-  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
-  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0,
+  0, 1, 0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
   ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
   pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
   span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
@@ -5,15 +5,17 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting Training}
 \usage{
-xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
-  feval = NULL, verbose = 1, print_every_n = 1L,
+xgb.train(params = list(), data, nrounds, watchlist = list(),
+  obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 
 xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
   params = list(), nrounds, verbose = 1, print_every_n = 1L,
   early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
-  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
+  save_name = "xgboost.model", xgb_model = NULL, callbacks = list(),
+  ...)
 }
 \arguments{
 \item{params}{the list of parameters.
R-package/tests/testthat/test_interactions.R (new file, 108 lines)
@@ -0,0 +1,108 @@
context('Test prediction of feature interactions')

require(xgboost)
require(magrittr)

set.seed(123)

test_that("predict feature interactions works", {
  # simulate some binary data and a linear outcome with an interaction term
  N <- 1000
  P <- 5
  X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P]))
  # center the data (as contributions are computed WRT feature means)
  X <- scale(X, scale=FALSE)

  # outcome without any interactions, without any noise:
  f <- function(x) 2 * x[, 1] - 3 * x[, 2]
  # outcome with interactions, without noise:
  f_int <- function(x) f(x) + 2 * x[, 2] * x[, 3]
  # outcome with interactions, with noise:
  #f_int_noise <- function(x) f_int(x) + rnorm(N, 0, 0.3)

  y <- f_int(X)

  dm <- xgb.DMatrix(X, label = y)
  param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2)
  b <- xgb.train(param, dm, 100)

  pred = predict(b, dm, outputmargin=TRUE)

  # SHAP contributions:
  cont <- predict(b, dm, predcontrib=TRUE)
  expect_equal(dim(cont), c(N, P+1))
  # make sure for each row they add up to marginal predictions
  max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001)
  # Hand-construct the 'ground truth' feature contributions:
  gt_cont <- cbind(
    2. * X[, 1],
    -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2
    1. * X[, 2] * X[, 3]                 # and another HALF of the interaction term to feature #3
  )
  gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3))
  # These should be relatively close:
  expect_lt(max(abs(cont - gt_cont)), 0.05)


  # SHAP interaction contributions:
  intr <- predict(b, dm, predinteraction=TRUE)
  expect_equal(dim(intr), c(N, P+1, P+1))
  # check assigned colnames
  cn <- c(letters[1:P], "BIAS")
  expect_equal(dimnames(intr), list(NULL, cn, cn))

  # check the symmetry
  max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001)

  # sums WRT columns must be close to feature contributions
  max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001)

  # diagonal terms for features 3,4,5 must be close to zero
  Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05)

  # BIAS must have no interactions
  max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001)

  # interactions other than 2 x 3 must be close to zero
  intr23 <- intr
  intr23[,2,3] <- 0
  Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05)

  # Construct the 'ground truth' contributions of interactions directly from the linear terms:
  gt_intr <- array(0, c(N, P+1, P+1))
  gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element
  gt_intr[,3,2] <- gt_intr[, 2, 3]
  # merge-in the diagonal based on 'ground truth' feature contributions
  intr_diag = gt_cont - apply(gt_intr, c(1,2), sum)
  for(j in seq_len(P)) {
    gt_intr[,j,j] = intr_diag[,j]
  }
  # These should be relatively close:
  expect_lt(max(abs(intr - gt_intr)), 0.1)
})
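The "attribute a HALF of the interaction term" bookkeeping above is just the Shapley value of a bilinear term. For the centered features used in this test (so E[x_2] = E[x_3] = 0 and the features are independent), a short sketch of the reasoning:

For $f(x) = 2x_1 - 3x_2 + 2x_2x_3$, the SHAP value of feature 2 averages its marginal contribution over the two orders in which $x_2$ and $x_3$ can be revealed:
\[
\phi_2 \;=\; -3x_2 \;+\; \tfrac{1}{2}\Big[\big(2x_2\,\mathbb{E}[x_3] - 2\,\mathbb{E}[x_2]\,\mathbb{E}[x_3]\big)
  \;+\; \big(2x_2x_3 - 2\,\mathbb{E}[x_2]\,x_3\big)\Big] \;=\; -3x_2 + x_2x_3 ,
\]
and symmetrically $\phi_3 = x_2x_3$: each of the two features receives half of the $2x_2x_3$ interaction. This is exactly how gt_cont is built, and gt_intr splits the same quantity between the symmetric entries $[2,3]$ and $[3,2]$ of the interaction array, while features 4 and 5 (and all their interactions) stay at zero.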
test_that("multiclass feature interactions work", {
  dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1)
  param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3)
  b <- xgb.train(param, dm, 40)
  pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t

  # SHAP contributions:
  cont <- predict(b, dm, predcontrib=TRUE)
  expect_length(cont, 3)
  # rewrap them as a 3d array
  cont <- unlist(cont) %>% array(c(150, 5, 3))
  # make sure for each row they add up to marginal predictions
  max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001)

  # SHAP interaction contributions:
  intr <- predict(b, dm, predinteraction=TRUE)
  expect_length(intr, 3)
  # rewrap them as a 4d array
  intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col]
  # check the symmetry
  max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001)
  # sums WRT columns must be close to feature contributions
  max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001)
})