[R] maintenance Nov 2017; SHAP plots (#2888)

* [R] fix predict contributions for data with no colnames
* [R] add a render parameter for xgb.plot.multi.trees; fixes #2628
* [R] update Rd's
* [R] remove unnecessary dep-package from R cmake install
* silence type warnings; readability
* [R] silence complaint about incomplete line at the end
* [R] initial version of xgb.plot.shap()
* [R] more work on xgb.plot.shap
* [R] enforce black font in xgb.plot.tree; fixes #2640
* [R] if feature names are available, check in predict that they are the same; fixes #2857
* [R] cran check and lint fixes
* remove tabs
* [R] add references; a test for plot.shap

This commit is contained in:
parent 1b77903eeb
commit e8a6597957
R-package/DESCRIPTION
@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
-Version: 0.6.4.7
-Date: 2017-09-25
+Version: 0.6.4.8
+Date: 2017-12-05
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,
Yuan Tang <terrytangyuan@gmail.com>
R-package/NAMESPACE
@@ -40,6 +40,7 @@ export(xgb.model.dt.tree)
export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.multi.trees)
+export(xgb.plot.shap)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
@@ -60,9 +61,12 @@ importFrom(data.table,rbindlist)
importFrom(data.table,setkey)
importFrom(data.table,setkeyv)
importFrom(data.table,setnames)
importFrom(grDevices,rgb)
importFrom(graphics,barplot)
importFrom(graphics,grid)
importFrom(graphics,lines)
importFrom(graphics,par)
importFrom(graphics,points)
importFrom(graphics,title)
importFrom(magrittr,"%>%")
importFrom(stats,median)
R-package/R/xgb.Booster.R
@@ -150,7 +150,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
#' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
-#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output
+#' values (Lundberg 2017) that sum to the difference between the expected output
#' of the model and the current prediction (where the hessian weights are used to compute the expectations).
#' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
#' in \url{http://blog.datadive.net/interpreting-random-forests/}.
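For orientation, a minimal sketch (editor's addition, not part of the diff) of the behavior this documentation describes, using the bundled agaricus data:

# SHAP contributions from predict(..., predcontrib = TRUE) sum per row
# to the untransformed margin prediction.
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 5, max_depth = 2, eta = 0.5,
               objective = "binary:logistic", verbose = 0)
contr <- predict(bst, agaricus.train$data, predcontrib = TRUE)
margin <- predict(bst, agaricus.train$data, outputmargin = TRUE)
summary(rowSums(contr) - margin)  # differences are ~0 up to float precision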
@@ -172,6 +172,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#'
#' @seealso
#' \code{\link{xgb.train}}.
#'
+#' @references
+#'
+#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+#'
+#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+#'
#' @examples
#' ## binary classification:
@@ -265,6 +271,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  object <- xgb.Booster.complete(object, saveraw = FALSE)
  if (!inherits(newdata, "xgb.DMatrix"))
    newdata <- xgb.DMatrix(newdata, missing = missing)
+  if (!is.null(object[["feature_names"]]) &&
+      !is.null(colnames(newdata)) &&
+      !identical(object[["feature_names"]], colnames(newdata)))
+    stop("Feature names stored in `object` and `newdata` are different!")
  if (is.null(ntreelimit))
    ntreelimit <- NVL(object$best_ntreelimit, 0)
  if (NVL(object$params[['booster']], '') == 'gblinear')
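A short sketch (editor's addition; continues the `bst` from the sketch above, the renamed column is hypothetical) of what the new check does:

# Mismatching feature names now stop prediction early;
# data without column names still passes through unchecked.
X <- agaricus.train$data
colnames(X)[1] <- "renamed_feature"   # hypothetical mismatch
try(predict(bst, X))                  # error: feature names are different
colnames(X) <- NULL
p <- predict(bst, X)                  # no colnames, so the check is skipped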
@@ -292,7 +302,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  } else if (predcontrib) {
    n_col1 <- ncol(newdata) + 1
    n_group <- npred_per_case / n_col1
-    dnames <- list(NULL, c(colnames(newdata), "BIAS"))
+    dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL
    ret <- if (n_ret == n_row) {
      matrix(ret, ncol = 1, dimnames = dnames)
    } else if (n_group == 1) {
R-package/R/xgb.importance.R
@@ -136,4 +136,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
-globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))
+globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature", "Class"))
R-package/R/xgb.plot.multi.trees.R
@@ -1,5 +1,5 @@
#' Project all trees on one tree and plot it
#'
#' Visualization of the ensemble of trees as a single collective unit.
#'
#' @param model produced by the \code{xgb.train} function.
@@ -7,52 +7,71 @@
#' @param features_keep number of features to keep in each position of the multi trees.
#' @param plot_width width in pixels of the graph to produce
#' @param plot_height height in pixels of the graph to produce
+#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param ... currently not used
#'
-#' @return Two graphs showing the distribution of the model deepness.
-#'
#' @details
#'
#' This function tries to capture the complexity of a gradient boosted tree model
#' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
#' The goal is to improve the interpretability of a model generally seen as black box.
#'
#' Note: this function is applicable to tree booster-based models only.
#'
#' It takes advantage of the fact that the shape of a binary tree is only defined by
#' its depth (therefore, in a boosting model, all trees have similar shape).
#'
#' Moreover, the trees tend to reuse the same features.
#'
#' The function projects each tree onto one, and keeps for each position the
#' \code{features_keep} first features (based on the Gain per feature measure).
#'
#' This function is inspired by this blog post:
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
#'
+#' @return
+#'
+#' When \code{render = TRUE}:
+#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+#' Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+#'
+#' When \code{render = FALSE}:
+#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+#' This could be useful if one wants to modify some of the graph attributes
+#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
+#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-#'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-#'                min_child_weight = 50)
+#'                eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
+#'                min_child_weight = 50, verbose = 0)
#'
-#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-#'                           features_keep = 3)
+#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
#' print(p)
#'
+#' \dontrun{
+#' # Below is an example of how to save this plot to a file.
+#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+#' library(DiagrammeR)
+#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+#' export_graph(gr, 'tree.pdf', width=1500, height=600)
+#' }
+#'
#' @export
-xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, ...){
+xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL,
+                                 render = TRUE, ...){
  check.deprecation(...)
  tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)

  # first number of the path represents the tree, then the following numbers are related to the path to follow
  # root init
  root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
-  tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
+  tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes]

  precedent.nodes <- root.nodes

  while(tree.matrix[,sum(is.na(abs.node.position))] > 0) {
    yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
    no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
@@ -64,9 +83,8 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
    precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
  }

-  tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")]
-  tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")]
+  tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")]
+  tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")]

  remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
@@ -120,8 +138,10 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
      attr_type = "edge",
      attr = c("color", "arrowsize", "arrowhead", "fontname"),
      value = c("DimGray", "1.5", "vee", "Helvetica"))

+  if (!render) return(invisible(graph))
+
  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)
}

globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos",
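A brief usage sketch (editor's addition) of the new render flag, assuming the `bst` fitted in the roxygen example above:

# render = FALSE returns the raw DiagrammeR graph for customization;
# it can then be rendered manually.
gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
class(gr)  # "dgr_graph"
DiagrammeR::render_graph(gr, width = 800)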
R-package/R/xgb.plot.shap.R (new file, 214 lines)
@@ -0,0 +1,214 @@
#' SHAP contribution dependency plots
#'
#' Visualizing the SHAP feature contribution to prediction dependencies on feature value.
#'
#' @param data data as a \code{matrix} or \code{dgCMatrix}.
#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above
#' \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.
#' @param features a vector of either column indices or of feature names to plot. When it is NULL,
#' feature importance is calculated, and \code{top_n} high ranked features are taken.
#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.
#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
#' or \code{features} is missing.
#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}.
#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index,
#' only SHAP contributions for that specific class are used.
#' If it is not set, SHAP importances are averaged over all classes.
#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.
#' @param subsample a random fraction of data points to use for plotting. When it is NULL,
#' it is set so that up to 100K data points are used.
#' @param n_col a number of columns in a grid of plots.
#' @param col color of the scatterplot markers.
#' @param pch scatterplot marker.
#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete.
#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions.
#' @param ylab a y-axis label in 1D plots.
#' @param plot_NA whether the contributions of cases with missing values should also be plotted.
#' @param col_NA a color of marker for missing value contributions.
#' @param pch_NA a marker type for NA values.
#' @param pos_NA a relative position of the x-location where NA values are shown:
#' \code{min(x) + (max(x) - min(x)) * pos_NA}.
#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with
#' more than 5 distinct values.
#' @param col_loess a color to use for the loess curves.
#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call.
#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.
#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned.
#' @param ... other parameters passed to \code{plot}.
#'
#' @details
#'
#' These scatterplots represent how SHAP feature contributions depend on feature values.
#' The similarity to partial dependency plots is that they also give an idea for how feature values
#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies
#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated
#' contributions of a feature to model prediction for each individual case.
#'
#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
#' weighted LOESS is computed and plotted, where weights are the numbers of data points
#' at each rounded value.
#'
#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
#' the margin is prediction before a sigmoidal transform into probability-like values.
#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
#' contributions for all features + bias), depending on the objective used, transforming SHAP
#' contributions for a feature from the marginal to the prediction space is not necessarily
#' a meaningful thing to do.
#'
#' @return
#'
#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
#' \itemize{
#'   \item \code{data} the values of selected features;
#'   \item \code{shap_contrib} the contributions of selected features.
#' }
#'
#' @references
#'
#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
#'
#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
#'                eta = 0.1, max_depth = 3, subsample = .5,
#'                method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
#'
#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
#'
#' # multiclass example - plots for each class separately:
#' nclass <- 3
#' nrounds <- 20
#' x <- as.matrix(iris[, -5])
#' set.seed(123)
#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
#'                 max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
#'                 objective = "multi:softprob", num_class = nclass, verbose = 0)
#' trees0 <- seq(from=0, by=nclass, length.out=nrounds)
#' col <- rgb(0, 0, 1, 0.5)
#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
#'
#' @rdname xgb.plot.shap
#' @export
xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
                          subsample = NULL, n_col = 1, col = rgb(0, 0, 1, 0.2), pch = '.',
                          discrete_n_uniq = 5, discrete_jitter = 0.01, ylab = "SHAP",
                          plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
                          plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
                          which = c("1d", "2d"), plot = TRUE, ...) {

  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
    stop("data: must be either matrix or dgCMatrix")

  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")

  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")

  if (!is.null(shap_contrib) &&
      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
    stop("shap_contrib is not compatible with the provided data")

  nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
  idx <- sample(1:nrow(data), nsample)
  data <- data[idx,]

  if (is.null(shap_contrib)) {
    shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
  } else {
    shap_contrib <- shap_contrib[idx,]
  }

  which <- match.arg(which)
  if (which == "2d")
    stop("2D plots are not implemented yet")

  if (is.null(features)) {
    imp <- xgb.importance(model = model, trees = trees)
    top_n <- as.integer(top_n[1])
    if (top_n < 1 || top_n > 100)
      stop("top_n: must be an integer within [1, 100]")
    features <- imp$Feature[1:min(top_n, NROW(imp))]
  }

  if (is.character(features)) {
    if (is.null(colnames(data)))
      stop("Either provide `data` with column names or provide `features` as column indices")
    features <- match(features, colnames(data))
  }

  if (n_col > length(features)) n_col <- length(features)

  if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
    shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
                    else Reduce("+", lapply(shap_contrib, abs))
  }

  shap_contrib <- shap_contrib[, features, drop = FALSE]
  data <- data[, features, drop = FALSE]
  cols <- colnames(data)
  if (is.null(cols)) cols <- colnames(shap_contrib)
  if (is.null(cols)) cols <- paste0('X', 1:ncol(data))
  colnames(data) <- cols
  colnames(shap_contrib) <- cols

  if (plot && which == "1d") {
    op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
              oma = c(0,0,0,0) + 0.2,
              mar = c(3.5,3.5,0,0) + 0.1,
              mgp = c(1.7, 0.6, 0))
    for (f in cols) {
      ord <- order(data[, f])
      x <- data[, f][ord]
      y <- shap_contrib[, f][ord]
      x_lim <- range(x, na.rm = TRUE)
      y_lim <- range(y, na.rm = TRUE)
      do_na <- plot_NA && any(is.na(x))
      if (do_na) {
        x_range <- diff(x_lim)
        loc_na <- min(x, na.rm = TRUE) + x_range * pos_NA
        x_lim <- range(c(x_lim, loc_na))
      }
      x_uniq <- unique(x)
      x2plot <- x
      # add small jitter for discrete features with <= 5 distinct values
      if (length(x_uniq) <= discrete_n_uniq)
        x2plot <- jitter(x, amount = discrete_jitter * min(diff(x_uniq), na.rm = TRUE))
      plot(x2plot, y, pch = pch, xlab = f, col = col, xlim = x_lim, ylim = y_lim, ylab = ylab, ...)
      grid()
      if (plot_loess) {
        # compress x to 3 digits, and mean-aggregate y
        zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x]
        if (nrow(zz) <= 5) {
          lines(zz$x, zz$y, col = col_loess)
        } else {
          lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = span_loess)
          zz$y_lo <- predict(lo, zz, type = "link")
          lines(zz$x, zz$y_lo, col = col_loess)
        }
      }
      if (do_na) {
        i_na <- which(is.na(x))
        x_na <- rep(loc_na, length(i_na))
        x_na <- jitter(x_na, amount = x_range * 0.01)
        points(x_na, y[i_na], pch = pch_NA, col = col_NA)
      }
    }
    par(op)
  }
  if (plot && which == "2d") {
    # TODO
  }
  invisible(list(data = data, shap_contrib = shap_contrib))
}
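One more usage sketch (editor's addition), assuming the binary `bst` and the agaricus.test data from the examples above:

# plot = FALSE silently returns the matrices for custom plotting.
res <- xgb.plot.shap(agaricus.test$data, model = bst, top_n = 3, plot = FALSE)
str(res$data)          # values of the 3 top-ranked features
str(res$shap_contrib)  # matching matrix of SHAP contributions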
R-package/R/xgb.plot.tree.R
@@ -95,7 +95,8 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
    label = dt$label,
    fillcolor = dt$filledcolor,
    shape = dt$shape,
-   data = dt$Feature)
+   data = dt$Feature,
+   fontcolor = "black")

  edges <- DiagrammeR::create_edge_df(
    from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID),
R-package/R/xgb.train.R
@@ -169,6 +169,11 @@
#' \code{\link{predict.xgb.Booster}},
#' \code{\link{xgb.cv}}
#'
+#' @references
+#'
+#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
R-package/R/xgboost.R
@@ -100,9 +100,12 @@ NULL
#' @importFrom stats median
#' @importFrom utils head
#' @importFrom graphics barplot
#' @importFrom graphics lines
#' @importFrom graphics points
#' @importFrom graphics grid
#' @importFrom graphics par
#' @importFrom graphics title
#' @importFrom grDevices rgb
#'
#' @import methods
#' @useDynLib xgboost, .registration = TRUE
R-package/demo/00Index
@@ -10,4 +10,5 @@ predict_leaf_indices Predicting the corresponding leaves
early_stopping Early Stop in training
poisson_regression Poisson Regression on count data
tweedie_regression Tweedie Regression
gpu_accelerated GPU-accelerated tree building algorithms
R-package/man/predict.xgb.Booster.Rd
@@ -7,7 +7,7 @@
\usage{
\method{predict}{xgb.Booster}(object, newdata, missing = NA,
  outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  predcontrib = FALSE, reshape = FALSE, ...)
+  predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...)

\method{predict}{xgb.Booster.handle}(object, ...)
}
@@ -19,8 +19,8 @@
\item{missing}{Missing is only used when input is dense matrix. Pick a float value that represents
missing values in data (e.g., sometimes 0 or some other extreme value is used).}

\item{outputmargin}{whether the prediction should be returned in the form of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}

\item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details).
@@ -30,24 +30,26 @@ It will use all the trees by default (\code{NULL} value).}

\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}

+\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).}
+
\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}

\item{...}{Parameters passed to \code{predict.xgb.Booster}}
}
\value{
For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
the \code{reshape} value.

When \code{predleaf = TRUE}, the output is a matrix object with the
number of columns corresponding to the number of trees.

When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
\code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
For a multiclass case, a list of \code{num_class} elements is returned, where each element is
such a matrix. The contribution values are on the scale of untransformed margin
(e.g., for binary classification would mean that the contributions are log-odds deviations from bias).
}
\description{
@@ -57,22 +59,23 @@ Predicted values based on either xgboost model or model handle object.
Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
and it is not necessarily equal to the number of trees in a model.
E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
But for multiclass classification, while there are multiple trees per iteration,
\code{ntreelimit} limits the number of boosting iterations.

Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
since gblinear doesn't keep its boosting history.

One possible practical application of the \code{predleaf} option is to use the model
as a generator of new features which capture non-linearity and interactions,
e.g., as implemented in \code{\link{xgb.create.features}}.

Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to
individual predictions. For "gblinear" booster, feature contributions are simply linear terms
-(feature_beta * feature_value). For "gbtree" booster, feature contribution is calculated
-as a sum of average contribution of that feature's split nodes across all trees to an
-individual prediction, following the idea explained in
-\url{http://blog.datadive.net/interpreting-random-forests/}.
+(feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
+values (Lundberg 2017) that sum to the difference between the expected output
+of the model and the current prediction (where the hessian weights are used to compute the expectations).
+Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
+in \url{http://blog.datadive.net/interpreting-random-forests/}.
}
\examples{
## binary classification:
@@ -82,7 +85,7 @@ data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test

bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
# use all trees by default
pred <- predict(bst, test$data)
@@ -98,7 +101,7 @@ str(pred_leaf)
# the result is an nsamples X (nfeatures + 1) matrix
pred_contr <- predict(bst, test$data, predcontrib = TRUE)
str(pred_contr)
-# verify that contributions' sums are equal to log-odds of predictions (up to foat precision):
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
summary(rowSums(pred_contr) - qlogis(pred))
# for the 1st record, let's inspect its features that had non-zero contribution to prediction:
contr1 <- pred_contr[1,]
@@ -137,7 +140,7 @@ bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
pred <- predict(bst, as.matrix(iris[, -5]))
str(pred)
all.equal(pred, pred_labels)
# prediction from using only 5 iterations should result
# in the same error as seen in iteration 5:
pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
sum(pred5 != lb)/length(lb)
@@ -158,6 +161,11 @@ err <- sapply(1:25, function(n) {
})
plot(err, type='l', ylim=c(0,0.1), xlab='#trees')

}
+\references{
+Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}
+
+Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
+}
\seealso{
\code{\link{xgb.train}}.
R-package/man/xgb.Booster.complete.Rd
@@ -9,32 +9,32 @@ xgb.Booster.complete(object, saveraw = TRUE)
\arguments{
\item{object}{object of class \code{xgb.Booster}}

\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data
when it doesn't already exist.}
}
\value{
An object of \code{xgb.Booster} class.
}
\description{
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
but it has a raw Booster memory dump).
}
\details{
While this method is primarily for internal use, it might be useful in some practical situations.

E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using
\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete} function explicitly once after loading a model as an R-object.
That would prevent further repeated implicit reconstruction of an internal booster model.
}
\examples{

data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
saveRDS(bst, "xgb.model.rds")
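A sketch (editor's addition) continuing the example above:

# After readRDS() the internal handle is invalid; completing the object
# once avoids repeated implicit reconstruction on later calls.
bst2 <- readRDS("xgb.model.rds")
bst2 <- xgb.Booster.complete(bst2)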
R-package/man/xgb.attr.Rd
@@ -20,18 +20,18 @@ xgb.attributes(object) <- value

\item{name}{a non-empty character string specifying which attribute is to be accessed.}

\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-}
it's a list (or an object coercible to a list) with the names of attributes to set
and the elements corresponding to attribute values.
Non-character values are converted to character.
When attribute value is not a scalar, only the first index is used.
Use \code{NULL} to remove an attribute.}
}
\value{
\code{xgb.attr} returns either a string value of an attribute
or \code{NULL} if an attribute wasn't stored in a model.

\code{xgb.attributes} returns a list of all attributes stored in a model
or \code{NULL} if a model has no stored attributes.
}
\description{
@@ -41,23 +41,23 @@ These methods allow to manipulate the key-value attribute strings of an xgboost
The primary purpose of xgboost model attributes is to store some meta-data about the model.
Note that they are a separate concept from the object attributes in R.
Specifically, they refer to key-value strings that can be attached to an xgboost model,
stored together with the model's binary representation, and accessed later
(from R or any other interface).
In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
would not be saved by \code{xgb.save} because an xgboost model is an external memory object
and its serialization is handled externally.
Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
change the value of that parameter for a model.
Use \code{\link{xgb.parameters<-}} to set or change model parameters.

The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
than for \code{xgb.Booster}, since only a handle (pointer) would need to be copied.
That would only matter if attributes need to be set many times.
Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
and it would be user's responsibility to call \code{xgb.save.raw} to update it.

The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes,
but it doesn't delete the other existing attributes.
}
\examples{
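A sketch (editor's addition) of the attribute round trip, assuming a fitted booster `bst` (the key and value are illustrative):

# Attach a key-value string to the model and read it back;
# unlike plain R attributes, these survive xgb.save/xgb.load.
xgb.attr(bst, "my_meta") <- "trained 2017-12-05"
xgb.attr(bst, "my_meta")
xgb.attributes(bst)  # list of all stored attributes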
R-package/man/xgb.plot.multi.trees.Rd
@@ -5,7 +5,7 @@
\title{Project all trees on one tree and plot it}
\usage{
xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,
-  plot_width = NULL, plot_height = NULL, ...)
+  plot_width = NULL, plot_height = NULL, render = TRUE, ...)
}
\arguments{
\item{model}{produced by the \code{xgb.train} function.}
@@ -18,41 +18,58 @@ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5,

\item{plot_height}{height in pixels of the graph to produce}

+\item{render}{a logical flag for whether the graph should be rendered (see Value).}
+
\item{...}{currently not used}
}
\value{
-Two graphs showing the distribution of the model deepness.
+When \code{render = TRUE}:
+returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}.
+Similar to ggplot objects, it needs to be printed to see it when not running from command line.
+
+When \code{render = FALSE}:
+silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}.
+This could be useful if one wants to modify some of the graph attributes
+before rendering the graph with \code{\link[DiagrammeR]{render_graph}}.
}
\description{
Visualization of the ensemble of trees as a single collective unit.
}
\details{
This function tries to capture the complexity of a gradient boosted tree model
in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
The goal is to improve the interpretability of a model generally seen as black box.

Note: this function is applicable to tree booster-based models only.

It takes advantage of the fact that the shape of a binary tree is only defined by
its depth (therefore, in a boosting model, all trees have similar shape).

Moreover, the trees tend to reuse the same features.

The function projects each tree onto one, and keeps for each position the
\code{features_keep} first features (based on the Gain per feature measure).

This function is inspired by this blog post:
\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
}
\examples{

data(agaricus.train, package='xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-  eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-  min_child_weight = 50)
+  eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
+  min_child_weight = 50, verbose = 0)

-p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
-  features_keep = 3)
+p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
print(p)

+\dontrun{
+# Below is an example of how to save this plot to a file.
+# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
+library(DiagrammeR)
+gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE)
+export_graph(gr, 'tree.pdf', width=1500, height=600)
+}

}
R-package/man/xgb.plot.shap.Rd (new file, 135 lines)
@@ -0,0 +1,135 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.plot.shap}
\alias{xgb.plot.shap}
\title{SHAP contribution dependency plots}
\usage{
xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1,
  model = NULL, trees = NULL, target_class = NULL,
  approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1,
  0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01,
  ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6),
  pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2,
  span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}

\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}

\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}

\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}

\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}

\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}

\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}

\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}

\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}

\item{n_col}{a number of columns in a grid of plots.}

\item{col}{color of the scatterplot markers.}

\item{pch}{scatterplot marker.}

\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.}

\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.}

\item{ylab}{a y-axis label in 1D plots.}

\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.}

\item{col_NA}{a color of marker for missing value contributions.}

\item{pch_NA}{a marker type for NA values.}

\item{pos_NA}{a relative position of the x-location where NA values are shown:
\code{min(x) + (max(x) - min(x)) * pos_NA}.}

\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with
more than 5 distinct values.}

\item{col_loess}{a color to use for the loess curves.}

\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.}

\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.}

\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.}

\item{...}{other parameters passed to \code{plot}.}
}
\value{
In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices:
\itemize{
  \item \code{data} the values of selected features;
  \item \code{shap_contrib} the contributions of selected features.
}
}
\description{
Visualizing the SHAP feature contribution to prediction dependencies on feature value.
}
\details{
These scatterplots represent how SHAP feature contributions depend on feature values.
The similarity to partial dependency plots is that they also give an idea for how feature values
affect predictions. However, in partial dependency plots, we usually see marginal dependencies
of model prediction on feature value, while SHAP contribution dependency plots display the estimated
contributions of a feature to model prediction for each individual case.

When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and
weighted LOESS is computed and plotted, where weights are the numbers of data points
at each rounded value.

Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective,
the margin is prediction before a sigmoidal transform into probability-like values.
Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP
contributions for all features + bias), depending on the objective used, transforming SHAP
contributions for a feature from the marginal to the prediction space is not necessarily
a meaningful thing to do.
}
\examples{

data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')

bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
               eta = 0.1, max_depth = 3, subsample = .5,
               method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)

# multiclass example - plots for each class separately:
nclass <- 3
nrounds <- 20
x <- as.matrix(iris[, -5])
set.seed(123)
is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values
mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
                max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
                objective = "multi:softprob", num_class = nclass, verbose = 0)
trees0 <- seq(from=0, by=nclass, length.out=nrounds)
col <- rgb(0, 0, 1, 0.5)
xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17)

}
\references{
Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874}

Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060}
}
R-package/man/xgb.train.Rd
@@ -258,6 +258,10 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               objective = "binary:logistic")
pred <- predict(bst, agaricus.test$data)

}
+\references{
+Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System",
+22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754}
+}
\seealso{
\code{\link{callbacks}},
R-package/tests/testthat/test_helpers.R
@@ -81,6 +81,11 @@ test_that("predict feature contributions works", {
  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
+  # must work with data that has no column names
+  X <- sparse_matrix
+  colnames(X) <- NULL
+  expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA)
+  expect_equal(pred_contr, pred_contr_, check.attributes = FALSE)

  # gbtree binary classifier (approximate method)
  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
@@ -289,6 +294,13 @@ test_that("xgb.plot.deepness works", {
  xgb.ggplot.deepness(model = bst.Tree)
})

+test_that("xgb.plot.shap works", {
+  sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
+  expect_equal(names(sh), c("data", "shap_contrib"))
+  expect_equal(NCOL(sh$data), 2)
+  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
test_that("check.deprecation works", {
  ttt <- function(a = NULL, DUMMY=NULL, ...) {
    check.deprecation(...)
@@ -78,9 +78,9 @@ function(setup_rpackage_install_target rlib_target build_dir)
  install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars\" \"all:\")")
  install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars.win\" \"all:\")")
  set(XGB_DEPS_SCRIPT
-    "deps = setdiff(c('statar','data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
+    "deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\
    if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')")
  install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" \"-q\" \"-e\" \"${XGB_DEPS_SCRIPT}\")")
  install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" CMD INSTALL\
    \"--no-multiarch\" \"${build_dir}/R-package\")")
endfunction(setup_rpackage_install_target)
include/xgboost/tree_model.h
@@ -516,7 +516,7 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat> {
   * \param out_contribs output vector to hold the contributions
   */
  inline void CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
                                           bst_float *out_contribs) const;
  /*!
   * \brief get next position of the tree given current pid
   * \param pid Current node id.
@@ -619,7 +619,7 @@ inline bst_float RegTree::FillNodeMeanValue(int nid) {
}

inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
                                                  bst_float *out_contribs) const {
  CHECK_GT(this->node_mean_values.size(), 0U);
  // this follows the idea of http://blog.datadive.net/interpreting-random-forests/
  bst_float node_value;
@@ -647,16 +647,16 @@

// extend our decision path with a fraction of one and zero extensions
inline void ExtendPath(PathElement *unique_path, unsigned unique_depth,
                       bst_float zero_fraction, bst_float one_fraction, int feature_index) {
  unique_path[unique_depth].feature_index = feature_index;
  unique_path[unique_depth].zero_fraction = zero_fraction;
  unique_path[unique_depth].one_fraction = one_fraction;
-  unique_path[unique_depth].pweight = static_cast<bst_float>(unique_depth == 0 ? 1 : 0);
-  for (int i = unique_depth-1; i >= 0; i--) {
-    unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1)
-                                / static_cast<bst_float>(unique_depth+1);
-    unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth-i)
-                             / static_cast<bst_float>(unique_depth+1);
+  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1.0f : 0.0f);
+  for (int i = unique_depth - 1; i >= 0; i--) {
+    unique_path[i+1].pweight += one_fraction * unique_path[i].pweight * (i + 1)
+                                / static_cast<bst_float>(unique_depth + 1);
+    unique_path[i].pweight = zero_fraction * unique_path[i].pweight * (unique_depth - i)
+                             / static_cast<bst_float>(unique_depth + 1);
  }
}
@@ -666,16 +666,16 @@ inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned
  const bst_float zero_fraction = unique_path[path_index].zero_fraction;
  bst_float next_one_portion = unique_path[unique_depth].pweight;

-  for (int i = unique_depth-1; i >= 0; --i) {
+  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
      const bst_float tmp = unique_path[i].pweight;
-      unique_path[i].pweight = next_one_portion*(unique_depth+1)
-                               / static_cast<bst_float>((i+1)*one_fraction);
-      next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth-i)
-                         / static_cast<bst_float>(unique_depth+1);
+      unique_path[i].pweight = next_one_portion * (unique_depth + 1)
+                               / static_cast<bst_float>((i + 1) * one_fraction);
+      next_one_portion = tmp - unique_path[i].pweight * zero_fraction * (unique_depth - i)
+                         / static_cast<bst_float>(unique_depth + 1);
    } else {
-      unique_path[i].pweight = (unique_path[i].pweight*(unique_depth+1))
-                               / static_cast<bst_float>(zero_fraction*(unique_depth-i));
+      unique_path[i].pweight = (unique_path[i].pweight * (unique_depth + 1))
+                               / static_cast<bst_float>(zero_fraction * (unique_depth - i));
    }
  }
@@ -694,16 +694,16 @@ inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_
  const bst_float zero_fraction = unique_path[path_index].zero_fraction;
  bst_float next_one_portion = unique_path[unique_depth].pweight;
  bst_float total = 0;
-  for (int i = unique_depth-1; i >= 0; --i) {
+  for (int i = unique_depth - 1; i >= 0; --i) {
    if (one_fraction != 0) {
-      const bst_float tmp = next_one_portion*(unique_depth+1)
-                            / static_cast<bst_float>((i+1)*one_fraction);
+      const bst_float tmp = next_one_portion * (unique_depth + 1)
+                            / static_cast<bst_float>((i + 1) * one_fraction);
      total += tmp;
-      next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth-i)
-                         / static_cast<bst_float>(unique_depth+1));
+      next_one_portion = unique_path[i].pweight - tmp * zero_fraction * ((unique_depth - i)
+                         / static_cast<bst_float>(unique_depth + 1));
    } else {
-      total += (unique_path[i].pweight/zero_fraction)/((unique_depth-i)
-               / static_cast<bst_float>(unique_depth+1));
+      total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i)
+               / static_cast<bst_float>(unique_depth + 1));
    }
  }
  return total;
@@ -718,7 +718,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,

  // extend the unique path
  PathElement *unique_path = parent_unique_path + unique_depth;
-  if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path);
+  if (unique_depth > 0) std::copy(parent_unique_path,
+                                  parent_unique_path + unique_depth, unique_path);
  ExtendPath(unique_path, unique_depth, parent_zero_fraction,
             parent_one_fraction, parent_feature_index);
  const unsigned split_index = node.split_index();
@@ -728,7 +729,7 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
    for (unsigned i = 1; i <= unique_depth; ++i) {
      const bst_float w = UnwoundPathSum(unique_path, unique_depth, i);
      const PathElement &el = unique_path[i];
-      phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value();
+      phi[el.feature_index] += w * (el.one_fraction - el.zero_fraction) * node.leaf_value();
    }

  // internal node
@@ -742,10 +743,11 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
    } else {
      hot_index = node.cright();
    }
-    const unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft());
+    const unsigned cold_index = (static_cast<int>(hot_index) == node.cleft() ?
+                                 node.cright() : node.cleft());
    const bst_float w = this->stat(node_index).sum_hess;
-    const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess/w;
-    const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess/w;
+    const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess / w;
+    const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess / w;
    bst_float incoming_zero_fraction = 1;
    bst_float incoming_one_fraction = 1;
@@ -753,19 +755,19 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
    // if so we undo that split so we can redo it for this node
    unsigned path_index = 0;
    for (; path_index <= unique_depth; ++path_index) {
-      if (unique_path[path_index].feature_index == split_index) break;
+      if (static_cast<unsigned>(unique_path[path_index].feature_index) == split_index) break;
    }
-    if (path_index != unique_depth+1) {
+    if (path_index != unique_depth + 1) {
      incoming_zero_fraction = unique_path[path_index].zero_fraction;
      incoming_one_fraction = unique_path[path_index].one_fraction;
      UnwindPath(unique_path, unique_depth, path_index);
      unique_depth -= 1;
    }

-    TreeShap(feat, phi, hot_index, unique_depth+1, unique_path,
+    TreeShap(feat, phi, hot_index, unique_depth + 1, unique_path,
             hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index);

-    TreeShap(feat, phi, cold_index, unique_depth+1, unique_path,
+    TreeShap(feat, phi, cold_index, unique_depth + 1, unique_path,
             cold_zero_fraction*incoming_zero_fraction, 0, split_index);
  }
}
@@ -773,21 +775,21 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
                                            bst_float *out_contribs) const {
  // find the expected value of the tree's predictions
-  bst_float base_value = 0.0;
-  bst_float total_cover = 0;
+  bst_float base_value = 0.0f;
+  bst_float total_cover = 0.0f;
  for (int i = 0; i < (*this).param.num_nodes; ++i) {
    const auto node = (*this)[i];
    if (node.is_leaf()) {
      const auto cover = this->stat(i).sum_hess;
-      base_value += cover*node.leaf_value();
+      base_value += cover * node.leaf_value();
      total_cover += cover;
    }
  }
  out_contribs[feat.size()] += base_value / total_cover;

  // Preallocate space for the unique path data
-  const int maxd = this->MaxDepth(root_id)+1;
-  PathElement *unique_path_data = new PathElement[(maxd*(maxd+1))/2];
+  const int maxd = this->MaxDepth(root_id) + 1;
+  PathElement *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2];

  TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1);
  delete[] unique_path_data;
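An editor's aside on the preallocation above, with the arithmetic checked in R (the depth value is illustrative):

# TreeShap() advances the path pointer by unique_depth at each recursion
# level, so with maxd = MaxDepth(root_id) + 1 it touches at most
# 1 + 2 + ... + maxd = maxd * (maxd + 1) / 2 PathElement slots.
maxd <- 6 + 1            # e.g., a tree of maximum depth 6
(maxd * (maxd + 1)) / 2  # 28 preallocated slots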