[R] maintenance Nov 2017; SHAP plots (#2888)
* [R] fix predict contributions for data with no colnames
* [R] add a render parameter for xgb.plot.multi.trees; fixes #2628
* [R] update Rd's
* [R] remove unnecessary dep-package from R cmake install
* silence type warnings; readability
* [R] silence complaint about incomplete line at the end
* [R] initial version of xgb.plot.shap()
* [R] more work on xgb.plot.shap
* [R] enforce black font in xgb.plot.tree; fixes #2640
* [R] if feature names are available, check in predict that they are the same; fixes #2857
* [R] cran check and lint fixes
* remove tabs
* [R] add references; a test for plot.shap
parent 1b77903eeb
commit e8a6597957
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.6.4.7
-Date: 2017-09-25
+Version: 0.6.4.8
+Date: 2017-12-05
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
     Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,
     Yuan Tang <terrytangyuan@gmail.com>

@@ -40,6 +40,7 @@ export(xgb.model.dt.tree)
 export(xgb.plot.deepness)
 export(xgb.plot.importance)
 export(xgb.plot.multi.trees)
+export(xgb.plot.shap)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)
@@ -60,9 +61,12 @@ importFrom(data.table,rbindlist)
 importFrom(data.table,setkey)
 importFrom(data.table,setkeyv)
 importFrom(data.table,setnames)
+importFrom(grDevices,rgb)
 importFrom(graphics,barplot)
 importFrom(graphics,grid)
+importFrom(graphics,lines)
 importFrom(graphics,par)
+importFrom(graphics,points)
 importFrom(graphics,title)
 importFrom(magrittr,"%>%")
 importFrom(stats,median)

@@ -150,7 +150,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
 #' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
 #' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
-#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output
+#' values (Lundberg 2017) that sum to the difference between the expected output
 #' of the model and the current prediction (where the hessian weights are used to compute the expectations).
 #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
 #' in \url{http://blog.datadive.net/interpreting-random-forests/}.
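
A minimal sketch of the documented contract (not part of the commit; data and parameter values are illustrative). Per-feature contributions plus the "BIAS" column sum to the margin prediction, and for "gblinear" each contribution is literally feature_beta * feature_value:

    library(xgboost)
    data(agaricus.train, package = 'xgboost')
    bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   booster = "gblinear", nrounds = 10, objective = "binary:logistic",
                   nthread = 2, verbose = 0)
    contr <- predict(bst, agaricus.train$data, predcontrib = TRUE)  # nsamples x (nfeatures + 1)
    marg  <- predict(bst, agaricus.train$data, outputmargin = TRUE)
    summary(rowSums(contr) - marg)  # ~0 up to float precision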
@@ -173,6 +173,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
 #' @examples
 #' ## binary classification:
 #'
@@ -265,6 +271,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
     newdata <- xgb.DMatrix(newdata, missing = missing)
+  if (!is.null(object[["feature_names"]]) &&
+      !is.null(colnames(newdata)) &&
+      !identical(object[["feature_names"]], colnames(newdata)))
+    stop("Feature names stored in `object` and `newdata` are different!")
   if (is.null(ntreelimit))
     ntreelimit <- NVL(object$best_ntreelimit, 0)
   if (NVL(object$params[['booster']], '') == 'gblinear')
@@ -292,7 +302,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   } else if (predcontrib) {
     n_col1 <- ncol(newdata) + 1
     n_group <- npred_per_case / n_col1
-    dnames <- list(NULL, c(colnames(newdata), "BIAS"))
+    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
    ret <- if (n_ret == n_row) {
       matrix(ret, ncol = 1, dimnames = dnames)
     } else if (n_group == 1) {
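
Taken together, the two changes above tighten predict() on named data and relax it on unnamed data: mismatched feature names now fail fast, while a matrix with no column names no longer breaks predcontrib (the fix the commit message refers to). A hedged sketch, reusing bst and the agaricus data from the example above:

    X <- agaricus.train$data
    colnames(X) <- NULL
    contr <- predict(bst, X, predcontrib = TRUE)  # no longer errors on unnamed data
    colnames(contr)                               # NULL - nothing to label the columns with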

@@ -136,4 +136,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))
+globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature", "Class"))

@@ -7,10 +7,9 @@
 #' @param features_keep number of features to keep in each position of the multi trees.
 #' @param plot_width width in pixels of the graph to produce
 #' @param plot_height height in pixels of the graph to produce
+#' @param render a logical flag for whether the graph should be rendered (see Value).
 #' @param ... currently not used
 #'
-#' @return Two graphs showing the distribution of the model deepness.
-#'
 #' @details
 #'
 #' This function tries to capture the complexity of a gradient boosted tree model
@@ -30,26 +29,46 @@
 #' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 #'
+#' @return
+#'
+#' When \code{render = TRUE}:
+#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+#'
+#' When \code{render = FALSE}:
+#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+#' This could be useful if one wants to modify some of the graph attributes
+#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
+#'
 #' @examples
+#'
 #' data(agaricus.train, package='xgboost')
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
 #'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                min_child_weight = 50)
+#'                min_child_weight = 50, verbose = 0)
 #'
-#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-#'                           features_keep = 3)
+#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 #' print(p)
 #'
+#' \dontrun{
+#' # Below is an example of how to save this plot to a file.
+#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+#' library(DiagrammeR)
+#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+#' export_graph(gr, 'tree.pdf', width=1500, height=600)
+#' }
+#'
 #' @export
-xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, ...){
+xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
+                                 render = TRUE, ...){
   check.deprecation(...)
   tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)

   # first number of the path represents the tree, then the following numbers are related to the path to follow
   # root init
   root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
-  tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
+  tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes]

   precedent.nodes <- root.nodes

@@ -64,9 +83,8 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
     precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
   }

-  tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")]
-  tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")]
+  tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
+  tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]

-
   remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")

@@ -121,6 +139,8 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
                                attr = c("color", "arrowsize", "arrowhead", "fontname"),
                                value = c("DimGray", "1.5", "vee", "Helvetica"))

+  if (!render) return(invisible(graph))
+
   DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
 }
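
A short usage sketch for the new flag (illustrative only; bst as in the roxygen example above):

    gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
    class(gr)                                  # "dgr_graph" - a plain DiagrammeR graph
    DiagrammeR::render_graph(gr, width = 800)  # render later, e.g. after editing attributes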

R-package/R/xgb.plot.shap.R (new file, 214 lines)
@@ -0,0 +1,214 @@
+#' SHAP contribution dependency plots
+#'
+#' Visualizing the SHAP feature contribution to prediction dependencies on feature value.
+#'
+#' @param data data as a \code{matrix} or \code{dgCMatrix}.
+#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above
+#'        \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.
+#' @param features a vector of either column indices or of feature names to plot. When it is NULL,
+#'        feature importance is calculated, and \code{top_n} high ranked features are taken.
+#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.
+#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
+#'        or \code{features} is missing.
+#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}.
+#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index,
+#'        only SHAP contributions for that specific class are used.
+#'        If it is not set, SHAP importances are averaged over all classes.
+#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.
+#' @param subsample a random fraction of data points to use for plotting. When it is NULL,
+#'        it is set so that up to 100K data points are used.
+#' @param n_col a number of columns in a grid of plots.
+#' @param col color of the scatterplot markers.
+#' @param pch scatterplot marker.
+#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete.
+#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions.
+#' @param ylab a y-axis label in 1D plots.
+#' @param plot_NA whether the contributions of cases with missing values should also be plotted.
+#' @param col_NA a color of marker for missing value contributions.
+#' @param pch_NA a marker type for NA values.
+#' @param pos_NA a relative position of the x-location where NA values are shown:
+#'        \code{min(x) + (max(x) - min(x)) * pos_NA}.
+#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with
+#'        more than 5 distinct values.
+#' @param col_loess a color to use for the loess curves.
+#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call.
+#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.
+#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned.
+#' @param ... other parameters passed to \code{plot}.
+#'
+#' @details
+#'
+#' These scatterplots represent how SHAP feature contributions depend on feature values.
+#' The similarity to partial dependency plots is that they also give an idea for how feature values
+#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies
+#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated
+#' contributions of a feature to model prediction for each individual case.
+#'
+#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
+#' weighted LOESS is computed and plotted, where weights are the numbers of data points
+#' at each rounded value.
+#'
+#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
+#' the margin is prediction before a sigmoidal transform into probability-like values.
+#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
+#' contributions for all features + bias), depending on the objective used, transforming SHAP
+#' contributions for a feature from the marginal to the prediction space is not necessarily
+#' a meaningful thing to do.
+#'
+#' @return
+#'
+#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
+#' \itemize{
+#'   \item \code{data} the values of selected features;
+#'   \item \code{shap_contrib} the contributions of selected features.
+#' }
+#'
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
+#' @examples
+#'
+#' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#'
+#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+#'                eta = 0.1, max_depth = 3, subsample = .5,
+#'                method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+#'
+#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
+#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
+#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+#'
+#' # multiclass example - plots for each class separately:
+#' nclass <- 3
+#' nrounds <- 20
+#' x <- as.matrix(iris[, -5])
+#' set.seed(123)
+#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
+#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
+#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+#'                 objective = "multi:softprob", num_class = nclass, verbose = 0)
+#' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
+#' col <- rgb(0, 0, 1, 0.5)
+#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+#'
+#' @rdname xgb.plot.shap
+#' @export
+xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
+                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
+                          subsample = NULL, n_col = 1, col = rgb(0, 0, 1, 0.2), pch = '.',
+                          discrete_n_uniq = 5, discrete_jitter = 0.01, ylab = "SHAP",
+                          plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
+                          plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
+                          which = c("1d", "2d"), plot = TRUE, ...) {
+
+  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
+    stop("data: must be either matrix or dgCMatrix")
+
+  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
+
+  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
+
+  if (!is.null(shap_contrib) &&
+      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
+    stop("shap_contrib is not compatible with the provided data")
+
+  nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
+  idx <- sample(1:nrow(data), nsample)
+  data <- data[idx,]
+
+  if (is.null(shap_contrib)) {
+    shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
+  } else {
+    shap_contrib <- shap_contrib[idx,]
+  }
+
+  which <- match.arg(which)
+  if (which == "2d")
+    stop("2D plots are not implemented yet")
+
+  if (is.null(features)) {
+    imp <- xgb.importance(model = model, trees = trees)
+    top_n <- as.integer(top_n[1])
+    if (top_n < 1 || top_n > 100)
+      stop("top_n: must be an integer within [1, 100]")
+    features <- imp$Feature[1:min(top_n, NROW(imp))]
+  }
+
+  if (is.character(features)) {
+    if (is.null(colnames(data)))
+      stop("Either provide `data` with column names or provide `features` as column indices")
+    features <- match(features, colnames(data))
+  }
+
+  if (n_col > length(features)) n_col <- length(features)
+
+  if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+    shap_contrib <- if (!is.null(target_class)) {
+      shap_contrib[[target_class + 1]]
+    } else {
+      Reduce("+", lapply(shap_contrib, abs))
+    }
+  }
+
+  shap_contrib <- shap_contrib[, features, drop = FALSE]
+  data <- data[, features, drop = FALSE]
+  cols <- colnames(data)
+  if (is.null(cols)) cols <- colnames(shap_contrib)
+  if (is.null(cols)) cols <- paste0('X', 1:ncol(data))
+  colnames(data) <- cols
+  colnames(shap_contrib) <- cols
+
+  if (plot && which == "1d") {
+    op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
+              oma = c(0,0,0,0) + 0.2,
+              mar = c(3.5,3.5,0,0) + 0.1,
+              mgp = c(1.7, 0.6, 0))
+    for (f in cols) {
+      ord <- order(data[, f])
+      x <- data[, f][ord]
+      y <- shap_contrib[, f][ord]
+      x_lim <- range(x, na.rm = TRUE)
+      y_lim <- range(y, na.rm = TRUE)
+      do_na <- plot_NA && any(is.na(x))
+      if (do_na) {
+        x_range <- diff(x_lim)
+        loc_na <- min(x, na.rm = TRUE) + x_range * pos_NA
+        x_lim <- range(c(x_lim, loc_na))
+      }
+      x_uniq <- unique(x)
+      x2plot <- x
+      # add small jitter for discrete features with <= 5 distinct values
+      if (length(x_uniq) <= discrete_n_uniq)
+        x2plot <- jitter(x, amount = discrete_jitter * min(diff(x_uniq), na.rm = TRUE))
+      plot(x2plot, y, pch = pch, xlab = f, col = col, xlim = x_lim, ylim = y_lim, ylab = ylab, ...)
+      grid()
+      if (plot_loess) {
+        # compress x to 3 digits, and mean-aggregate y
+        zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
+        if (nrow(zz) <= 5) {
+          lines(zz$x, zz$y, col = col_loess)
+        } else {
+          lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = span_loess)
+          zz$y_lo <- predict(lo, zz, type = "link")
+          lines(zz$x, zz$y_lo, col = col_loess)
+        }
+      }
+      if (do_na) {
+        i_na <- which(is.na(x))
+        x_na <- rep(loc_na, length(i_na))
+        x_na <- jitter(x_na, amount = x_range * 0.01)
+        points(x_na, y[i_na], pch = pch_NA, col = col_NA)
+      }
+    }
+    par(op)
+  }
+  if (plot && which == "2d") {
+    # TODO
+  }
+  invisible(list(data = data, shap_contrib = shap_contrib))
+}
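
The plot_loess branch above smooths an aggregate rather than the raw points. A standalone sketch of just that aggregation step (x and y below are hypothetical stand-ins for one feature column and its SHAP contributions):

    library(data.table)
    set.seed(1)
    x <- rnorm(1000)
    y <- x^2 + rnorm(1000, sd = 0.1)
    # round x to 3 significant digits and mean-aggregate y,
    # keeping the per-value counts .N to serve as loess weights
    zz <- data.table(x = signif(x, 3), y = y)[, .(.N, y = mean(y)), by = x]
    lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = 0.5)
    o <- order(zz$x)
    plot(x, y, pch = '.')
    lines(zz$x[o], predict(lo, zz)[o], col = 2)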

@@ -95,7 +95,8 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
                            label = dt$label,
                            fillcolor = dt$filledcolor,
                            shape = dt$shape,
-                           data = dt$Feature)
+                           data = dt$Feature,
+                           fontcolor = "black")

   edges <- DiagrammeR::create_edge_df(
     from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID),

@@ -169,6 +169,11 @@
 #' \code{\link{predict.xgb.Booster}},
 #' \code{\link{xgb.cv}}
 #'
+#' @references
+#'
+#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')

@@ -100,9 +100,12 @@ NULL
 #' @importFrom stats median
 #' @importFrom utils head
 #' @importFrom graphics barplot
+#' @importFrom graphics lines
+#' @importFrom graphics points
 #' @importFrom graphics grid
 #' @importFrom graphics par
 #' @importFrom graphics title
+#' @importFrom grDevices rgb
 #'
 #' @import methods
 #' @useDynLib xgboost, .registration = TRUE

@@ -11,3 +11,4 @@ early_stopping Early Stop in training
 poisson_regression   Poisson Regression on count data
 tweedie_regression   Tweedie Regression
 gpu_accelerated      GPU-accelerated tree building algorithms
+
@@ -7,7 +7,7 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)

 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -30,6 +30,8 @@ It will use all the trees by default (\code{NULL} value).}

 \item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}

+\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
 prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}

@@ -69,10 +71,11 @@ e.g., as implemented in \code{\link{xgb.create.features}}.

 Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
 individual predictions. For "gblinear" booster, feature contributions are simply linear terms
-(feature_beta * feature_value). For "gbtree" booster, feature contribution is calculated
-as a sum of average contribution of that feature's split nodes across all trees to an
-individual prediction, following the idea explained in
-\url{http://blog.datadive.net/interpreting-random-forests/}.
+(feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
+values (Lundberg 2017) that sum to the difference between the expected output
+of the model and the current prediction (where the hessian weights are used to compute the expectations).
+Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
+in \url{http://blog.datadive.net/interpreting-random-forests/}.
 }
 \examples{
 ## binary classification:
@@ -98,7 +101,7 @@ str(pred_leaf)
 # the result is an nsamples X (nfeatures + 1) matrix
 pred_contr <- predict(bst, test$data, predcontrib = TRUE)
 str(pred_contr)
-# verify that contributions' sums are equal to log-odds of predictions (up to foat precision):
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
 summary(rowSums(pred_contr) - qlogis(pred))
 # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
 contr1 <- pred_contr[1,]
@@ -158,6 +161,11 @@ err <- sapply(1:25, function(n) {
 })
 plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
+
+}
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
 }
 \seealso{
 \code{\link{xgb.train}}.

@@ -5,7 +5,7 @@
 \title{Project all trees on one tree and plot it}
 \usage{
 xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,
-  plot_width = NULL, plot_height = NULL, ...)
+  plot_width = NULL, plot_height = NULL, render = TRUE, ...)
 }
 \arguments{
 \item{model}{produced by the \code{xgb.train} function.}
@@ -18,10 +18,19 @@ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,

 \item{plot_height}{height in pixels of the graph to produce}

+\item{render}{a logical flag for whether the graph should be rendered (see Value).}
+
 \item{...}{currently not used}
 }
 \value{
-Two graphs showing the distribution of the model deepness.
+When \code{render = TRUE}:
+returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+
+When \code{render = FALSE}:
+silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+This could be useful if one wants to modify some of the graph attributes
+before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
 }
 \description{
 Visualization of the ensemble of trees as a single collective unit.
@@ -45,14 +54,22 @@ This function is inspired by this blog post:
 \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 }
 \examples{
+
 data(agaricus.train, package='xgboost')

 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-               min_child_weight = 50)
+               min_child_weight = 50, verbose = 0)

-p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-                          features_keep = 3)
+p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 print(p)

+\dontrun{
+# Below is an example of how to save this plot to a file.
+# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+library(DiagrammeR)
+gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+export_graph(gr, 'tree.pdf', width=1500, height=600)
+}
+
 }

R-package/man/xgb.plot.shap.Rd (new file, 135 lines)
@@ -0,0 +1,135 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.shap.R
+\name{xgb.plot.shap}
+\alias{xgb.plot.shap}
+\title{SHAP contribution dependency plots}
+\usage{
+xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
+  model = NULL, trees = NULL, target_class = NULL,
+  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
+  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
+  ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
+  pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
+  span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
+}
+\arguments{
+\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
+
+\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
+\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
+
+\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
+feature importance is calculated, and \code{top_n} high ranked features are taken.}
+
+\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
+
+\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
+or \code{features} is missing.}
+
+\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
+
+\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
+only SHAP contributions for that specific class are used.
+If it is not set, SHAP importances are averaged over all classes.}
+
+\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
+
+\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
+it is set so that up to 100K data points are used.}
+
+\item{n_col}{a number of columns in a grid of plots.}
+
+\item{col}{color of the scatterplot markers.}
+
+\item{pch}{scatterplot marker.}
+
+\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.}
+
+\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.}
+
+\item{ylab}{a y-axis label in 1D plots.}
+
+\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.}
+
+\item{col_NA}{a color of marker for missing value contributions.}
+
+\item{pch_NA}{a marker type for NA values.}
+
+\item{pos_NA}{a relative position of the x-location where NA values are shown:
+\code{min(x) + (max(x) - min(x)) * pos_NA}.}
+
+\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with
+more than 5 distinct values.}
+
+\item{col_loess}{a color to use for the loess curves.}
+
+\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.}
+
+\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.}
+
+\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.}
+
+\item{...}{other parameters passed to \code{plot}.}
+}
+\value{
+In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
+\itemize{
+  \item \code{data} the values of selected features;
+  \item \code{shap_contrib} the contributions of selected features.
+}
+}
+\description{
+Visualizing the SHAP feature contribution to prediction dependencies on feature value.
+}
+\details{
+These scatterplots represent how SHAP feature contributions depend on feature values.
+The similarity to partial dependency plots is that they also give an idea for how feature values
+affect predictions. However, in partial dependency plots, we usually see marginal dependencies
+of model prediction on feature value, while SHAP contribution dependency plots display the estimated
+contributions of a feature to model prediction for each individual case.
+
+When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
+weighted LOESS is computed and plotted, where weights are the numbers of data points
+at each rounded value.
+
+Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
+the margin is prediction before a sigmoidal transform into probability-like values.
+Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
+contributions for all features + bias), depending on the objective used, transforming SHAP
+contributions for a feature from the marginal to the prediction space is not necessarily
+a meaningful thing to do.
+}
+\examples{
+
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+
+bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+               eta = 0.1, max_depth = 3, subsample = .5,
+               method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+
+xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
+contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
+xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+
+# multiclass example - plots for each class separately:
+nclass <- 3
+nrounds <- 20
+x <- as.matrix(iris[, -5])
+set.seed(123)
+is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
+mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
+                max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+                objective = "multi:softprob", num_class = nclass, verbose = 0)
+trees0 <- seq(from=0, by=nclass, length.out=nrounds)
+col <- rgb(0, 0, 1, 0.5)
+xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
+
+}
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+}

@@ -258,6 +258,10 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                objective = "binary:logistic")
 pred <- predict(bst, agaricus.test$data)
+
+}
+\references{
+Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
 }
 \seealso{
 \code{\link{callbacks}},

@@ -81,6 +81,11 @@ test_that("predict feature contributions works", {
   expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
   pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
   expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+  # must work with data that has no column names
+  X <- sparse_matrix
+  colnames(X) <- NULL
+  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
+  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE)

   # gbtree binary classifier (approximate method)
   expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
@@ -289,6 +294,13 @@ test_that("xgb.plot.deepness works", {
   xgb.ggplot.deepness(model = bst.Tree)
 })

+test_that("xgb.plot.shap works", {
+  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
+  expect_equal(names(sh), c("data", "shap_contrib"))
+  expect_equal(NCOL(sh$data), 2)
+  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
 test_that("check.deprecation works", {
   ttt <- function(a = NNULL, DUMMY=NULL, ...) {
     check.deprecation(...)

@@ -78,7 +78,7 @@ function(setup_rpackage_install_target rlib_target build_dir)
   install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars\" \"all:\")")
   install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars.win\" \"all:\")")
   set(XGB_DEPS_SCRIPT
-    "deps = setdiff(c('statar','data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
+    "deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
     if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')")
   install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" \"-q\" \"-e\" \"${XGB_DEPS_SCRIPT}\")")
   install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" CMD INSTALL\
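
With the unused 'statar' package dropped, the generated bootstrap is equivalent to running this in R by hand (a sketch of what the install step executes):

    deps <- setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()))
    if (length(deps) > 0) install.packages(deps, repo = 'https://cloud.r-project.org/')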

@@ -516,7 +516,7 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat> {
    * \param out_contribs output vector to hold the contributions
    */
   inline void CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
                                            bst_float *out_contribs) const;
   /*!
    * \brief get next position of the tree given current pid
    * \param pid Current node id.
@@ -619,7 +619,7 @@ inline bst_float RegTree::FillNodeMeanValue(int nid) {
 }

 inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
                                                   bst_float *out_contribs) const {
   CHECK_GT(this->node_mean_values.size(), 0U);
   // this follows the idea of http://blog.datadive.net/interpreting-random-forests/
   bst_float node_value;
@@ -647,16 +647,16 @@ inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, uns

 // extend our decision path with a fraction of one and zero extensions
 inline void ExtendPath(PathElement *unique_path, unsigned unique_depth,
                        bst_float zero_fraction, bst_float one_fraction, int feature_index) {
   unique_path[unique_depth].feature_index = feature_index;
   unique_path[unique_depth].zero_fraction = zero_fraction;
   unique_path[unique_depth].one_fraction = one_fraction;
-  unique_path[unique_depth].pweight = static_cast<bst_float>(unique_depth == 0 ? 1 : 0);
-  for (int i = unique_depth-1; i >= 0; i--) {
-    unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1)
-                                / static_cast<bst_float>(unique_depth+1);
-    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth-i)
-                             / static_cast<bst_float>(unique_depth+1);
+  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1.0f : 0.0f);
+  for (int i = unique_depth - 1; i >= 0; i--) {
+    unique_path[i+1].pweight += one_fraction * unique_path[i].pweight * (i + 1)
+                                / static_cast<bst_float>(unique_depth + 1);
+    unique_path[i].pweight = zero_fraction * unique_path[i].pweight * (unique_depth - i)
+                             / static_cast<bst_float>(unique_depth + 1);
   }
 }

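ExtendPath keeps, for each path length, the total Shapley permutation weight of feature subsets of that size. One invariant worth knowing when reading the loop: with every zero_fraction and one_fraction equal to 1, the weights after the root element plus d feature extensions are uniform, matching C(d,i) * i!(d-i)!/(d+1)! = 1/(d+1). A small R mirror of the loop above (indices shifted by one for R; illustrative only, not the shipped implementation):

    extend_path <- function(pw, zero_fraction = 1, one_fraction = 1) {
      d <- length(pw)                  # unique_depth before this extension
      pw <- c(pw, if (d == 0) 1 else 0)
      if (d > 0) for (i in d:1) {      # R's pw[i] is the C code's pweight[i-1]
        pw[i + 1] <- pw[i + 1] + one_fraction * pw[i] * i / (d + 1)
        pw[i]     <- zero_fraction * pw[i] * (d - i + 1) / (d + 1)
      }
      pw
    }
    pw <- numeric(0)
    for (k in 1:4) pw <- extend_path(pw)   # root dummy element + 3 features
    pw                                     # 0.25 0.25 0.25 0.25 = 1/(d+1) with d = 3
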
@@ -666,16 +666,16 @@ inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned
   const bst_float zero_fraction = unique_path[path_index].zero_fraction;
   bst_float next_one_portion = unique_path[unique_depth].pweight;

-  for (int i = unique_depth-1; i >= 0; --i) {
+  for (int i = unique_depth - 1; i >= 0; --i) {
     if (one_fraction != 0) {
       const bst_float tmp = unique_path[i].pweight;
-      unique_path[i].pweight = next_one_portion*(unique_depth+1)
-                               / static_cast<bst_float>((i+1)*one_fraction);
-      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth-i)
-                         / static_cast<bst_float>(unique_depth+1);
+      unique_path[i].pweight = next_one_portion * (unique_depth + 1)
+                               / static_cast<bst_float>((i + 1) * one_fraction);
+      next_one_portion = tmp - unique_path[i].pweight * zero_fraction * (unique_depth - i)
+                         / static_cast<bst_float>(unique_depth + 1);
     } else {
-      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth+1))
-                               / static_cast<bst_float>(zero_fraction*(unique_depth-i));
+      unique_path[i].pweight = (unique_path[i].pweight * (unique_depth + 1))
+                               / static_cast<bst_float>(zero_fraction * (unique_depth - i));
     }
   }

@@ -694,16 +694,16 @@ inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_
   const bst_float zero_fraction = unique_path[path_index].zero_fraction;
   bst_float next_one_portion = unique_path[unique_depth].pweight;
   bst_float total = 0;
-  for (int i = unique_depth-1; i >= 0; --i) {
+  for (int i = unique_depth - 1; i >= 0; --i) {
     if (one_fraction != 0) {
-      const bst_float tmp = next_one_portion*(unique_depth+1)
-                            / static_cast<bst_float>((i+1)*one_fraction);
+      const bst_float tmp = next_one_portion * (unique_depth + 1)
+                            / static_cast<bst_float>((i + 1) * one_fraction);
       total += tmp;
-      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth-i)
-                         / static_cast<bst_float>(unique_depth+1));
+      next_one_portion = unique_path[i].pweight - tmp * zero_fraction * ((unique_depth - i)
+                         / static_cast<bst_float>(unique_depth+1));
     } else {
-      total += (unique_path[i].pweight/zero_fraction)/((unique_depth-i)
-               / static_cast<bst_float>(unique_depth+1));
+      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
+               / static_cast<bst_float>(unique_depth + 1));
     }
   }
   return total;
@@ -718,7 +718,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,

   // extend the unique path
   PathElement *unique_path = parent_unique_path + unique_depth;
-  if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path);
+  if (unique_depth > 0) std::copy(parent_unique_path,
+                                  parent_unique_path + unique_depth, unique_path);
   ExtendPath(unique_path, unique_depth, parent_zero_fraction,
              parent_one_fraction, parent_feature_index);
   const unsigned split_index = node.split_index();
@@ -728,7 +729,7 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   for (unsigned i = 1; i <= unique_depth; ++i) {
     const bst_float w = UnwoundPathSum(unique_path, unique_depth, i);
     const PathElement &el = unique_path[i];
-    phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value();
+    phi[el.feature_index] += w * (el.one_fraction - el.zero_fraction) * node.leaf_value();
   }

   // internal node
@@ -742,10 +743,11 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   } else {
     hot_index = node.cright();
   }
-  const unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft());
+  const unsigned cold_index = (static_cast<int>(hot_index) == node.cleft() ?
+                               node.cright() : node.cleft());
   const bst_float w = this->stat(node_index).sum_hess;
-  const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess/w;
-  const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess/w;
+  const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess / w;
+  const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess / w;
   bst_float incoming_zero_fraction = 1;
   bst_float incoming_one_fraction = 1;

@@ -753,19 +755,19 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   // if so we undo that split so we can redo it for this node
   unsigned path_index = 0;
   for (; path_index <= unique_depth; ++path_index) {
-    if (unique_path[path_index].feature_index == split_index) break;
+    if (static_cast<unsigned>(unique_path[path_index].feature_index) == split_index) break;
   }
-  if (path_index != unique_depth+1) {
+  if (path_index != unique_depth + 1) {
     incoming_zero_fraction = unique_path[path_index].zero_fraction;
     incoming_one_fraction = unique_path[path_index].one_fraction;
     UnwindPath(unique_path, unique_depth, path_index);
     unique_depth -= 1;
   }

-  TreeShap(feat, phi, hot_index, unique_depth+1, unique_path,
+  TreeShap(feat, phi, hot_index, unique_depth + 1, unique_path,
            hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index);

-  TreeShap(feat, phi, cold_index, unique_depth+1, unique_path,
+  TreeShap(feat, phi, cold_index, unique_depth + 1, unique_path,
            cold_zero_fraction*incoming_zero_fraction, 0, split_index);
   }
 }
@@ -773,21 +775,21 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
 inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
                                             bst_float *out_contribs) const {
   // find the expected value of the tree's predictions
-  bst_float base_value = 0.0;
-  bst_float total_cover = 0;
+  bst_float base_value = 0.0f;
+  bst_float total_cover = 0.0f;
   for (int i = 0; i < (*this).param.num_nodes; ++i) {
     const auto node = (*this)[i];
     if (node.is_leaf()) {
       const auto cover = this->stat(i).sum_hess;
-      base_value += cover*node.leaf_value();
+      base_value += cover * node.leaf_value();
       total_cover += cover;
     }
   }
   out_contribs[feat.size()] += base_value / total_cover;

   // Preallocate space for the unique path data
-  const int maxd = this->MaxDepth(root_id)+1;
-  PathElement *unique_path_data = new PathElement[(maxd*(maxd+1))/2];
+  const int maxd = this->MaxDepth(root_id) + 1;
+  PathElement *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2];

   TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1);
   delete[] unique_path_data;
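
CalculateContributions seeds the bias slot with the cover-weighted mean leaf value before recursing, so for gbtree the "BIAS" column of a predcontrib matrix is the same for every row: the expected margin over the training distribution. A quick hedged check (bst and the agaricus data as in the R examples above):

    contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
    range(contr[, "BIAS"])  # constant across rows, up to float precision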