diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index b39385bc4..32f83d2e7 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,8 +1,8 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 0.6.4.7 -Date: 2017-09-25 +Version: 0.6.4.8 +Date: 2017-12-05 Author: Tianqi Chen , Tong He , Michael Benesty , Vadim Khotilovich , Yuan Tang diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 031bc7880..e0fe76a58 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -40,6 +40,7 @@ export(xgb.model.dt.tree) export(xgb.plot.deepness) export(xgb.plot.importance) export(xgb.plot.multi.trees) +export(xgb.plot.shap) export(xgb.plot.tree) export(xgb.save) export(xgb.save.raw) @@ -60,9 +61,12 @@ importFrom(data.table,rbindlist) importFrom(data.table,setkey) importFrom(data.table,setkeyv) importFrom(data.table,setnames) +importFrom(grDevices,rgb) importFrom(graphics,barplot) importFrom(graphics,grid) +importFrom(graphics,lines) importFrom(graphics,par) +importFrom(graphics,points) importFrom(graphics,title) importFrom(magrittr,"%>%") importFrom(stats,median) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 21edbfa7f..c49b45179 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -150,7 +150,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to #' individual predictions. For "gblinear" booster, feature contributions are simply linear terms #' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP -#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output +#' values (Lundberg 2017) that sum to the difference between the expected output #' of the model and the current prediction (where the hessian weights are used to compute the expectations). #' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained #' in \url{http://blog.datadive.net/interpreting-random-forests/}. @@ -172,6 +172,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' #' @seealso #' \code{\link{xgb.train}}. +#' +#' @references +#' +#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874} +#' +#' Scott M. 
Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060} #' #' @examples #' ## binary classification: @@ -265,6 +271,10 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA object <- xgb.Booster.complete(object, saveraw = FALSE) if (!inherits(newdata, "xgb.DMatrix")) newdata <- xgb.DMatrix(newdata, missing = missing) + if (!is.null(object[["feature_names"]]) && + !is.null(colnames(newdata)) && + !identical(object[["feature_names"]], colnames(newdata))) + stop("Feature names stored in `object` and `newdata` are different!") if (is.null(ntreelimit)) ntreelimit <- NVL(object$best_ntreelimit, 0) if (NVL(object$params[['booster']], '') == 'gblinear') @@ -292,7 +302,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA } else if (predcontrib) { n_col1 <- ncol(newdata) + 1 n_group <- npred_per_case / n_col1 - dnames <- list(NULL, c(colnames(newdata), "BIAS")) + dnames <- if (!is.null(colnames(newdata))) list(NULL, c(colnames(newdata), "BIAS")) else NULL ret <- if (n_ret == n_row) { matrix(ret, ncol = 1, dimnames = dnames) } else if (n_group == 1) { diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 5c08d4d0e..62e37e8d5 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -136,4 +136,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature")) +globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature", "Class")) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 3cd565e02..3e7b04b8c 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -1,5 +1,5 @@ #' Project all trees on one tree and plot it -#' +#' #' Visualization of the ensemble of trees as a single collective unit. #' #' @param model produced by the \code{xgb.train} function. @@ -7,52 +7,71 @@ #' @param features_keep number of features to keep in each position of the multi trees. #' @param plot_width width in pixels of the graph to produce #' @param plot_height height in pixels of the graph to produce +#' @param render a logical flag for whether the graph should be rendered (see Value). #' @param ... currently not used #' -#' @return Two graphs showing the distribution of the model deepness. -#' #' @details -#' -#' This function tries to capture the complexity of a gradient boosted tree model +#' +#' This function tries to capture the complexity of a gradient boosted tree model #' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation. #' The goal is to improve the interpretability of a model generally seen as black box. -#' +#' #' Note: this function is applicable to tree booster-based models only. -#' -#' It takes advantage of the fact that the shape of a binary tree is only defined by -#' its depth (therefore, in a boosting model, all trees have similar shape). -#' +#' +#' It takes advantage of the fact that the shape of a binary tree is only defined by +#' its depth (therefore, in a boosting model, all trees have similar shape). +#' #' Moreover, the trees tend to reuse the same features. 
-#' -#' The function projects each tree onto one, and keeps for each position the +#' +#' The function projects each tree onto one, and keeps for each position the #' \code{features_keep} first features (based on the Gain per feature measure). -#' +#' #' This function is inspired by this blog post: #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} #' +#' @return +#' +#' When \code{render = TRUE}: +#' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}. +#' Similar to ggplot objects, it needs to be printed to see it when not running from command line. +#' +#' When \code{render = FALSE}: +#' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}. +#' This could be useful if one wants to modify some of the graph attributes +#' before rendering the graph with \code{\link[DiagrammeR]{render_graph}}. +#' #' @examples +#' #' data(agaricus.train, package='xgboost') #' #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15, -#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", -#' min_child_weight = 50) +#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", +#' min_child_weight = 50, verbose = 0) #' -#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), -#' features_keep = 3) +#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3) #' print(p) #' +#' \dontrun{ +#' # Below is an example of how to save this plot to a file. +#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed. +#' library(DiagrammeR) +#' gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE) +#' export_graph(gr, 'tree.pdf', width=1500, height=600) +#' } +#' #' @export -xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, ...){ +xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, plot_width = NULL, plot_height = NULL, + render = TRUE, ...){ check.deprecation(...) tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model) - + # first number of the path represents the tree, then the following numbers are related to the path to follow # root init root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID] - tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes] - + tree.matrix[ID %in% root.nodes, abs.node.position := root.nodes] + precedent.nodes <- root.nodes - + while(tree.matrix[,sum(is.na(abs.node.position))] > 0) { yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)] no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)] @@ -64,9 +83,8 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) } - tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")] - tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")] - + tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")] + tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")] remove.tree <- . 
%>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "") @@ -120,8 +138,10 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, attr_type = "edge", attr = c("color", "arrowsize", "arrowhead", "fontname"), value = c("DimGray", "1.5", "vee", "Helvetica")) - - DiagrammeR::render_graph(graph, width = plot_width, height = plot_height) + + if (!render) return(invisible(graph)) + + DiagrammeR::render_graph(graph, width = plot_width, height = plot_height) } globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos", diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R new file mode 100644 index 000000000..6031b69a7 --- /dev/null +++ b/R-package/R/xgb.plot.shap.R @@ -0,0 +1,214 @@ +#' SHAP contribution dependency plots +#' +#' Visualizing the SHAP feature contribution to prediction dependencies on feature value. +#' +#' @param data data as a \code{matrix} or \code{dgCMatrix}. +#' @param shap_contrib a matrix of SHAP contributions that was computed earlier for the above +#' \code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}. +#' @param features a vector of either column indices or of feature names to plot. When it is NULL, +#' feature importance is calculated, and \code{top_n} high ranked features are taken. +#' @param top_n when \code{features} is NULL, top_n [1, 100] most important features in a model are taken. +#' @param model an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib} +#' or \code{features} is missing. +#' @param trees passed to \code{\link{xgb.importance}} when \code{features = NULL}. +#' @param target_class is only relevant for multiclass models. When it is set to a 0-based class index, +#' only SHAP contributions for that specific class are used. +#' If it is not set, SHAP importances are averaged over all classes. +#' @param approxcontrib passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}. +#' @param subsample a random fraction of data points to use for plotting. When it is NULL, +#' it is set so that up to 100K data points are used. +#' @param n_col a number of columns in a grid of plots. +#' @param col color of the scatterplot markers. +#' @param pch scatterplot marker. +#' @param discrete_n_uniq a maximal number of unique values in a feature to consider it as discrete. +#' @param discrete_jitter an \code{amount} parameter of jitter added to discrete features' positions. +#' @param ylab a y-axis label in 1D plots. +#' @param plot_NA whether the contributions of cases with missing values should also be plotted. +#' @param col_NA a color of marker for missing value contributions. +#' @param pch_NA a marker type for NA values. +#' @param pos_NA a relative position of the x-location where NA values are shown: +#' \code{min(x) + (max(x) - min(x)) * pos_NA}. +#' @param plot_loess whether to plot loess-smoothed curves. The smoothing is only done for features with +#' more than 5 distinct values. +#' @param col_loess a color to use for the loess curves. +#' @param span_loess the \code{span} parameter in \code{\link[stats]{loess}}'s call. +#' @param which whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far. +#' @param plot whether a plot should be drawn. If FALSE, only a list of matrices is returned. +#' @param ... other parameters passed to \code{plot}. +#' +#' @details +#' +#' These scatterplots represent how SHAP feature contributions depend on feature values.
+#' The similarity to partial dependency plots is that they also give an idea for how feature values +#' affect predictions. However, in partial dependency plots, we usually see marginal dependencies +#' of model prediction on feature value, while SHAP contribution dependency plots display the estimated +#' contributions of a feature to model prediction for each individual case. +#' +#' When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and +#' weighted LOESS is computed and plotted, where weights are the numbers of data points +#' at each rounded value. +#' +#' Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective, +#' the margin is prediction before a sigmoidal transform into probability-like values. +#' Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP +#' contributions for all features + bias), depending on the objective used, transforming SHAP +#' contributions for a feature from the marginal to the prediction space is not necessarily +#' a meaningful thing to do. +#' +#' @return +#' +#' In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices: +#' \itemize{ +#' \item \code{data} the values of selected features; +#' \item \code{shap_contrib} the contributions of selected features. +#' } +#' +#' @references +#' +#' Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874} +#' +#' Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060} +#' +#' @examples +#' +#' data(agaricus.train, package='xgboost') +#' data(agaricus.test, package='xgboost') +#' +#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, +#' eta = 0.1, max_depth = 3, subsample = .5, +#' method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0) +#' +#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") +#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) +#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) +#' +#' # multiclass example - plots for each class separately: +#' nclass <- 3 +#' nrounds <- 20 +#' x <- as.matrix(iris[, -5]) +#' set.seed(123) +#' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values +#' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds, +#' max_depth = 2, eta = 0.3, subsample = .5, nthread = 2, +#' objective = "multi:softprob", num_class = nclass, verbose = 0) +#' trees0 <- seq(from=0, by=nclass, length.out=nrounds) +#' col <- rgb(0, 0, 1, 0.5) +#' xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +#' xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +#' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +#' +#' @rdname xgb.plot.shap +#' @export +xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL, + trees = NULL, target_class = NULL, approxcontrib = FALSE, + subsample = NULL, n_col = 1, col = rgb(0, 0, 1, 0.2), pch = '.', + discrete_n_uniq = 5, discrete_jitter = 0.01, ylab = "SHAP", + plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07, + plot_loess = 
TRUE, col_loess = 2, span_loess = 0.5, + which = c("1d", "2d"), plot = TRUE, ...) { + + if (!is.matrix(data) && !inherits(data, "dgCMatrix")) + stop("data: must be either matrix or dgCMatrix") + + if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster"))) + stop("when shap_contrib is not provided, one must provide an xgb.Booster model") + + if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster"))) + stop("when features are not provided, one must provide an xgb.Booster model to rank the features") + + if (!is.null(shap_contrib) && + (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1)) + stop("shap_contrib is not compatible with the provided data") + + nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data)) + idx <- sample(1:nrow(data), nsample) + data <- data[idx,] + + if (is.null(shap_contrib)) { + shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib) + } else { + shap_contrib <- shap_contrib[idx,] + } + + which <- match.arg(which) + if (which == "2d") + stop("2D plots are not implemented yet") + + if (is.null(features)) { + imp <- xgb.importance(model = model, trees = trees) + top_n <- as.integer(top_n[1]) + if (top_n < 1 || top_n > 100) + stop("top_n: must be an integer within [1, 100]") + features <- imp$Feature[1:min(top_n, NROW(imp))] + } + + if (is.character(features)) { + if (is.null(colnames(data))) + stop("Either provide `data` with column names or provide `features` as column indices") + features <- match(features, colnames(data)) + } + + if (n_col > length(features)) n_col <- length(features) + + if (is.list(shap_contrib)) { # multiclass: either choose a class or merge + shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] + else Reduce("+", lapply(shap_contrib, abs)) + } + + shap_contrib <- shap_contrib[, features, drop = FALSE] + data <- data[, features, drop = FALSE] + cols <- colnames(data) + if (is.null(cols)) cols <- colnames(shap_contrib) + if (is.null(cols)) cols <- paste0('X', 1:ncol(data)) + colnames(data) <- cols + colnames(shap_contrib) <- cols + + if (plot && which == "1d") { + op <- par(mfrow = c(ceiling(length(features) / n_col), n_col), + oma = c(0,0,0,0) + 0.2, + mar = c(3.5,3.5,0,0) + 0.1, + mgp = c(1.7, 0.6, 0)) + for (f in cols) { + ord <- order(data[, f]) + x <- data[, f][ord] + y <- shap_contrib[, f][ord] + x_lim <- range(x, na.rm = TRUE) + y_lim <- range(y, na.rm = TRUE) + do_na <- plot_NA && any(is.na(x)) + if (do_na) { + x_range <- diff(x_lim) + loc_na <- min(x, na.rm = TRUE) + x_range * pos_NA + x_lim <- range(c(x_lim, loc_na)) + } + x_uniq <- unique(x) + x2plot <- x + # add small jitter for discrete features with <= 5 distinct values + if (length(x_uniq) <= discrete_n_uniq) + x2plot <- jitter(x, amount = discrete_jitter * min(diff(x_uniq), na.rm = TRUE)) + plot(x2plot, y, pch = pch, xlab = f, col = col, xlim = x_lim, ylim = y_lim, ylab = ylab, ...)
+ grid() + if (plot_loess) { + # compress x to 3 digits, and mean-aggredate y + zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x] + if (nrow(zz) <= 5) { + lines(zz$x, zz$y, col = col_loess) + } else { + lo <- stats::loess(y ~ x, data = zz, weights = zz$N, span = span_loess) + zz$y_lo <- predict(lo, zz, type = "link") + lines(zz$x, zz$y_lo, col = col_loess) + } + } + if (do_na) { + i_na <- which(is.na(x)) + x_na <- rep(loc_na, length(i_na)) + x_na <- jitter(x_na, amount = x_range * 0.01) + points(x_na, y[i_na], pch = pch_NA, col = col_NA) + } + } + par(op) + } + if (plot && which == "2d") { + # TODO + } + invisible(list(data = data, shap_contrib = shap_contrib)) +} diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index b5ed3445c..29c37d606 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -95,7 +95,8 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot label = dt$label, fillcolor = dt$filledcolor, shape = dt$shape, - data = dt$Feature) + data = dt$Feature, + fontcolor = "black") edges <- DiagrammeR::create_edge_df( from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID), diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index a644055ca..73b79bebc 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -169,6 +169,11 @@ #' \code{\link{predict.xgb.Booster}}, #' \code{\link{xgb.cv}} #' +#' @references +#' +#' Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", +#' 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754} +#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 66c4c1bd2..6991a0f83 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -100,9 +100,12 @@ NULL #' @importFrom stats median #' @importFrom utils head #' @importFrom graphics barplot +#' @importFrom graphics lines +#' @importFrom graphics points #' @importFrom graphics grid #' @importFrom graphics par #' @importFrom graphics title +#' @importFrom grDevices rgb #' #' @import methods #' @useDynLib xgboost, .registration = TRUE diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 3ecb7966c..e1955f7f0 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -10,4 +10,5 @@ predict_leaf_indices Predicting the corresponding leaves early_stopping Early Stop in training poisson_regression Poisson Regression on count data tweedie_regression Tweddie Regression -gpu_accelerated GPU-accelerated tree building algorithms \ No newline at end of file +gpu_accelerated GPU-accelerated tree building algorithms + diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index ef93c12a7..33830b159 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -7,7 +7,7 @@ \usage{ \method{predict}{xgb.Booster}(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, - predcontrib = FALSE, reshape = FALSE, ...) + predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) \method{predict}{xgb.Booster.handle}(object, ...) } @@ -19,8 +19,8 @@ \item{missing}{Missing is only used when input is dense matrix. 
Pick a float value that represents missing values in data (e.g., sometimes 0 or some other extreme value is used).} -\item{outputmargin}{whether the prediction should be returned in the for of original untransformed -sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for +\item{outputmargin}{whether the prediction should be returned in the for of original untransformed +sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for logistic regression would result in predictions for log-odds instead of probabilities.} \item{ntreelimit}{limit the number of model's trees or boosting iterations used in prediction (see Details). @@ -30,24 +30,26 @@ It will use all the trees by default (\code{NULL} value).} \item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).} -\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several +\item{approxcontrib}{whether to use a fast approximation for feature contributions (see Details).} + +\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.} \item{...}{Parameters passed to \code{predict.xgb.Booster}} } \value{ For regression or binary classification, it returns a vector of length \code{nrows(newdata)}. -For multiclass classification, either a \code{num_class * nrows(newdata)} vector or -a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on +For multiclass classification, either a \code{num_class * nrows(newdata)} vector or +a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on the \code{reshape} value. -When \code{predleaf = TRUE}, the output is a matrix object with the +When \code{predleaf = TRUE}, the output is a matrix object with the number of columns corresponding to the number of trees. When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with \code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias. For a multiclass case, a list of \code{num_class} elements is returned, where each element is -such a matrix. The contribution values are on the scale of untransformed margin +such a matrix. The contribution values are on the scale of untransformed margin (e.g., for binary classification would mean that the contributions are log-odds deviations from bias). } \description{ @@ -57,22 +59,23 @@ Predicted values based on either xgboost model or model handle object. Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations and it is not necessarily equal to the number of trees in a model. E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees. -But for multiclass classification, while there are multiple trees per iteration, +But for multiclass classification, while there are multiple trees per iteration, \code{ntreelimit} limits the number of boosting iterations. -Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, +Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, since gblinear doesn't keep its boosting history. 
-One possible practical applications of the \code{predleaf} option is to use the model -as a generator of new features which capture non-linearity and interactions, +One possible practical applications of the \code{predleaf} option is to use the model +as a generator of new features which capture non-linearity and interactions, e.g., as implemented in \code{\link{xgb.create.features}}. Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to individual predictions. For "gblinear" booster, feature contributions are simply linear terms -(feature_beta * feature_value). For "gbtree" booster, feature contribution is calculated -as a sum of average contribution of that feature's split nodes across all trees to an -individual prediction, following the idea explained in -\url{http://blog.datadive.net/interpreting-random-forests/}. +(feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP +values (Lundberg 2017) that sum to the difference between the expected output +of the model and the current prediction (where the hessian weights are used to compute the expectations). +Setting \code{approxcontrib = TRUE} approximates these values following the idea explained +in \url{http://blog.datadive.net/interpreting-random-forests/}. } \examples{ ## binary classification: @@ -82,7 +85,7 @@ data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max_depth = 2, +bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic") # use all trees by default pred <- predict(bst, test$data) @@ -98,7 +101,7 @@ str(pred_leaf) # the result is an nsamples X (nfeatures + 1) matrix pred_contr <- predict(bst, test$data, predcontrib = TRUE) str(pred_contr) -# verify that contributions' sums are equal to log-odds of predictions (up to foat precision): +# verify that contributions' sums are equal to log-odds of predictions (up to float precision): summary(rowSums(pred_contr) - qlogis(pred)) # for the 1st record, let's inspect its features that had non-zero contribution to prediction: contr1 <- pred_contr[1,] @@ -137,7 +140,7 @@ bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, pred <- predict(bst, as.matrix(iris[, -5])) str(pred) all.equal(pred, pred_labels) -# prediction from using only 5 iterations should result +# prediction from using only 5 iterations should result # in the same error as seen in iteration 5: pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5) sum(pred5 != lb)/length(lb) @@ -158,6 +161,11 @@ err <- sapply(1:25, function(n) { }) plot(err, type='l', ylim=c(0,0.1), xlab='#trees') +} +\references{ +Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874} + +Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060} } \seealso{ \code{\link{xgb.train}}. 
diff --git a/R-package/man/xgb.Booster.complete.Rd b/R-package/man/xgb.Booster.complete.Rd index 44b3c039c..57b5c7e7e 100644 --- a/R-package/man/xgb.Booster.complete.Rd +++ b/R-package/man/xgb.Booster.complete.Rd @@ -9,32 +9,32 @@ xgb.Booster.complete(object, saveraw = TRUE) \arguments{ \item{object}{object of class \code{xgb.Booster}} -\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data +\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data when it doesn't already exist.} } \value{ An object of \code{xgb.Booster} class. } \description{ -It attempts to complete an \code{xgb.Booster} object by restoring either its missing +It attempts to complete an \code{xgb.Booster} object by restoring either its missing raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid) -or its missing internal handle (when its \code{xgb.Booster.handle} is not valid +or its missing internal handle (when its \code{xgb.Booster.handle} is not valid but it has a raw Booster memory dump). } \details{ While this method is primarily for internal use, it might be useful in some practical situations. E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object, -its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods -should still work for such a model object since those methods would be using -\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the +its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods +should still work for such a model object since those methods would be using +\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the \code{xgb.Booster.complete} function explicitely once after loading a model as an R-object. That would prevent further repeated implicit reconstruction of an internal booster model. } \examples{ data(agaricus.train, package='xgboost') -bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") saveRDS(bst, "xgb.model.rds") diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index d501491bb..7bbf45b8b 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -20,18 +20,18 @@ xgb.attributes(object) <- value \item{name}{a non-empty character string specifying which attribute is to be accessed.} -\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} -it's a list (or an object coercible to a list) with the names of attributes to set -and the elements corresponding to attribute values. +\item{value}{a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} +it's a list (or an object coercible to a list) with the names of attributes to set +and the elements corresponding to attribute values. Non-character values are converted to character. When attribute value is not a scalar, only the first index is used. Use \code{NULL} to remove an attribute.} } \value{ -\code{xgb.attr} returns either a string value of an attribute +\code{xgb.attr} returns either a string value of an attribute or \code{NULL} if an attribute wasn't stored in a model. 
-\code{xgb.attributes} returns a list of all attribute stored in a model +\code{xgb.attributes} returns a list of all attribute stored in a model or \code{NULL} if a model has no stored attributes. } \description{ @@ -41,23 +41,23 @@ These methods allow to manipulate the key-value attribute strings of an xgboost The primary purpose of xgboost model attributes is to store some meta-data about the model. Note that they are a separate concept from the object attributes in R. Specifically, they refer to key-value strings that can be attached to an xgboost model, -stored together with the model's binary representation, and accessed later +stored together with the model's binary representation, and accessed later (from R or any other interface). In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class would not be saved by \code{xgb.save} because an xgboost model is an external memory object and its serialization is handled externally. -Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't -change the value of that parameter for a model. +Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't +change the value of that parameter for a model. Use \code{\link{xgb.parameters<-}} to set or change model parameters. The attribute setters would usually work more efficiently for \code{xgb.Booster.handle} than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied. That would only matter if attributes need to be set many times. Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters, -the raw model cache of an \code{xgb.Booster} object would not be automatically updated, +the raw model cache of an \code{xgb.Booster} object would not be automatically updated, and it would be user's responsibility to call \code{xgb.save.raw} to update it. -The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, +The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, but it doesn't delete the other existing attributes. } \examples{ diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index 3f8adcaaa..84055dd22 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -5,7 +5,7 @@ \title{Project all trees on one tree and plot it} \usage{ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5, - plot_width = NULL, plot_height = NULL, ...) + plot_width = NULL, plot_height = NULL, render = TRUE, ...) } \arguments{ \item{model}{produced by the \code{xgb.train} function.} @@ -18,41 +18,58 @@ xgb.plot.multi.trees(model, feature_names = NULL, features_keep = 5, \item{plot_height}{height in pixels of the graph to produce} +\item{render}{a logical flag for whether the graph should be rendered (see Value).} + \item{...}{currently not used} } \value{ -Two graphs showing the distribution of the model deepness. +When \code{render = TRUE}: +returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}. +Similar to ggplot objects, it needs to be printed to see it when not running from command line. + +When \code{render = FALSE}: +silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}. +This could be useful if one wants to modify some of the graph attributes +before rendering the graph with \code{\link[DiagrammeR]{render_graph}}. 
} \description{ Visualization of the ensemble of trees as a single collective unit. } \details{ -This function tries to capture the complexity of a gradient boosted tree model +This function tries to capture the complexity of a gradient boosted tree model in a cohesive way by compressing an ensemble of trees into a single tree-graph representation. The goal is to improve the interpretability of a model generally seen as black box. Note: this function is applicable to tree booster-based models only. -It takes advantage of the fact that the shape of a binary tree is only defined by -its depth (therefore, in a boosting model, all trees have similar shape). +It takes advantage of the fact that the shape of a binary tree is only defined by +its depth (therefore, in a boosting model, all trees have similar shape). Moreover, the trees tend to reuse the same features. -The function projects each tree onto one, and keeps for each position the +The function projects each tree onto one, and keeps for each position the \code{features_keep} first features (based on the Gain per feature measure). This function is inspired by this blog post: \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} } \examples{ + data(agaricus.train, package='xgboost') bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15, - eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", - min_child_weight = 50) + eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", + min_child_weight = 50, verbose = 0) -p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), - features_keep = 3) +p <- xgb.plot.multi.trees(model = bst, features_keep = 3) print(p) +\dontrun{ +# Below is an example of how to save this plot to a file. +# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed. +library(DiagrammeR) +gr <- xgb.plot.multi.trees(model=bst, features_keep = 3, render=FALSE) +export_graph(gr, 'tree.pdf', width=1500, height=600) +} + } diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd new file mode 100644 index 000000000..9dcf7fce8 --- /dev/null +++ b/R-package/man/xgb.plot.shap.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.shap.R +\name{xgb.plot.shap} +\alias{xgb.plot.shap} +\title{SHAP contribution dependency plots} +\usage{ +xgb.plot.shap(data, shap_contrib = NULL, features = NULL, top_n = 1, + model = NULL, trees = NULL, target_class = NULL, + approxcontrib = FALSE, subsample = NULL, n_col = 1, col = rgb(0, 0, 1, + 0.2), pch = ".", discrete_n_uniq = 5, discrete_jitter = 0.01, + ylab = "SHAP", plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), + pch_NA = ".", pos_NA = 1.07, plot_loess = TRUE, col_loess = 2, + span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...) +} +\arguments{ +\item{data}{data as a \code{matrix} or \code{dgCMatrix}.} + +\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above +\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.} + +\item{features}{a vector of either column indices or of feature names to plot. When it is NULL, +feature importance is calculated, and \code{top_n} high ranked features are taken.} + +\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.} + +\item{model}{an \code{xgb.Booster} model. 
It has to be provided when either \code{shap_contrib} +or \code{features} is missing.} + +\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.} + +\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index, +only SHAP contributions for that specific class are used. +If it is not set, SHAP importances are averaged over all classes.} + +\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.} + +\item{subsample}{a random fraction of data points to use for plotting. When it is NULL, +it is set so that up to 100K data points are used.} + +\item{n_col}{a number of columns in a grid of plots.} + +\item{col}{color of the scatterplot markers.} + +\item{pch}{scatterplot marker.} + +\item{discrete_n_uniq}{a maximal number of unique values in a feature to consider it as discrete.} + +\item{discrete_jitter}{an \code{amount} parameter of jitter added to discrete features' positions.} + +\item{ylab}{a y-axis label in 1D plots.} + +\item{plot_NA}{whether the contributions of cases with missing values should also be plotted.} + +\item{col_NA}{a color of marker for missing value contributions.} + +\item{pch_NA}{a marker type for NA values.} + +\item{pos_NA}{a relative position of the x-location where NA values are shown: +\code{min(x) + (max(x) - min(x)) * pos_NA}.} + +\item{plot_loess}{whether to plot loess-smoothed curves. The smoothing is only done for features with +more than 5 distinct values.} + +\item{col_loess}{a color to use for the loess curves.} + +\item{span_loess}{the \code{span} parameter in \code{\link[stats]{loess}}'s call.} + +\item{which}{whether to do univariate or bivariate plotting. NOTE: only 1D is implemented so far.} + +\item{plot}{whether a plot should be drawn. If FALSE, only a list of matrices is returned.} + +\item{...}{other parameters passed to \code{plot}.} +} +\value{ +In addition to producing plots (when \code{plot=TRUE}), it silently returns a list of two matrices: +\itemize{ + \item \code{data} the values of selected features; + \item \code{shap_contrib} the contributions of selected features. +} +} +\description{ +Visualizing the SHAP feature contribution to prediction dependencies on feature value. +} +\details{ +These scatterplots represent how SHAP feature contributions depend on feature values. +The similarity to partial dependency plots is that they also give an idea for how feature values +affect predictions. However, in partial dependency plots, we usually see marginal dependencies +of model prediction on feature value, while SHAP contribution dependency plots display the estimated +contributions of a feature to model prediction for each individual case. + +When \code{plot_loess = TRUE} is set, feature values are rounded to 3 significant digits and +weighted LOESS is computed and plotted, where weights are the numbers of data points +at each rounded value. + +Note: SHAP contributions are shown on the scale of model margin. E.g., for a logistic binomial objective, +the margin is prediction before a sigmoidal transform into probability-like values. +Also, since SHAP stands for "SHapley Additive exPlanation" (model prediction = sum of SHAP +contributions for all features + bias), depending on the objective used, transforming SHAP +contributions for a feature from the marginal to the prediction space is not necessarily +a meaningful thing to do.
+} +\examples{ + +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') + +bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, + eta = 0.1, max_depth = 3, subsample = .5, + method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0) + +xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") +contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) +xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) + +# multiclass example - plots for each class separately: +nclass <- 3 +nrounds <- 20 +x <- as.matrix(iris[, -5]) +set.seed(123) +is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values +mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds, + max_depth = 2, eta = 0.3, subsample = .5, nthread = 2, + objective = "multi:softprob", num_class = nclass, verbose = 0) +trees0 <- seq(from=0, by=nclass, length.out=nrounds) +col <- rgb(0, 0, 1, 0.5) +xgb.plot.shap(x, model = mbst, trees = trees0, target_class = 0, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) + +} +\references{ +Scott M. Lundberg, Su-In Lee, "A Unified Approach to Interpreting Model Predictions", NIPS Proceedings 2017, \url{https://arxiv.org/abs/1705.07874} + +Scott M. Lundberg, Su-In Lee, "Consistent feature attribution for tree ensembles", \url{https://arxiv.org/abs/1706.06060} +} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 37a5db995..e4e4c145f 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -258,6 +258,10 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, objective = "binary:logistic") pred <- predict(bst, agaricus.test$data) +} +\references{ +Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", +22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754} } \seealso{ \code{\link{callbacks}}, diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 95e57929d..67f4ab40b 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -81,6 +81,11 @@ test_that("predict feature contributions works", { expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS")) pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE) expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) + # must work with data that has no column names + X <- sparse_matrix + colnames(X) <- NULL + expect_error(pred_contr_ <- predict(bst.Tree, X, predcontrib = TRUE), regexp = NA) + expect_equal(pred_contr, pred_contr_, check.attributes = FALSE) # gbtree binary classifier (approximate method) expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA) @@ -289,6 +294,13 @@ test_that("xgb.plot.deepness works", { xgb.ggplot.deepness(model = bst.Tree) }) +test_that("xgb.plot.shap works", { + sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4) + expect_equal(names(sh), c("data", "shap_contrib")) + expect_equal(NCOL(sh$data), 2) + expect_equal(NCOL(sh$shap_contrib), 2) +}) + test_that("check.deprecation works", { ttt <- function(a = NNULL, 
DUMMY=NULL, ...) { check.deprecation(...) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 75cbebb1f..5e34a408b 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -78,9 +78,9 @@ function(setup_rpackage_install_target rlib_target build_dir) install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars\" \"all:\")") install(CODE "file(WRITE \"${build_dir}/R-package/src/Makevars.win\" \"all:\")") set(XGB_DEPS_SCRIPT - "deps = setdiff(c('statar','data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\ + "deps = setdiff(c('data.table', 'magrittr', 'stringi'), rownames(installed.packages()));\ if(length(deps)>0) install.packages(deps, repo = 'https://cloud.r-project.org/')") install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" \"-q\" \"-e\" \"${XGB_DEPS_SCRIPT}\")") install(CODE "execute_process(COMMAND \"${LIBR_EXECUTABLE}\" CMD INSTALL\ \"--no-multiarch\" \"${build_dir}/R-package\")") -endfunction(setup_rpackage_install_target) \ No newline at end of file +endfunction(setup_rpackage_install_target) diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index e95b83e77..57b673b5e 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -516,7 +516,7 @@ class RegTree: public TreeModel { * \param out_contribs output vector to hold the contributions */ inline void CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id, - bst_float *out_contribs) const; + bst_float *out_contribs) const; /*! * \brief get next position of the tree given current pid * \param pid Current node id. @@ -619,7 +619,7 @@ inline bst_float RegTree::FillNodeMeanValue(int nid) { } inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id, - bst_float *out_contribs) const { + bst_float *out_contribs) const { CHECK_GT(this->node_mean_values.size(), 0U); // this follows the idea of http://blog.datadive.net/interpreting-random-forests/ bst_float node_value; @@ -647,16 +647,16 @@ inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, uns // extend our decision path with a fraction of one and zero extensions inline void ExtendPath(PathElement *unique_path, unsigned unique_depth, - bst_float zero_fraction, bst_float one_fraction, int feature_index) { + bst_float zero_fraction, bst_float one_fraction, int feature_index) { unique_path[unique_depth].feature_index = feature_index; unique_path[unique_depth].zero_fraction = zero_fraction; unique_path[unique_depth].one_fraction = one_fraction; - unique_path[unique_depth].pweight = static_cast(unique_depth == 0 ? 1 : 0); - for (int i = unique_depth-1; i >= 0; i--) { - unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1) - / static_cast(unique_depth+1); - unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth-i) - / static_cast(unique_depth+1); + unique_path[unique_depth].pweight = (unique_depth == 0 ? 
1.0f : 0.0f); + for (int i = unique_depth - 1; i >= 0; i--) { + unique_path[i+1].pweight += one_fraction * unique_path[i].pweight * (i + 1) + / static_cast(unique_depth + 1); + unique_path[i].pweight = zero_fraction * unique_path[i].pweight * (unique_depth - i) + / static_cast(unique_depth + 1); } } @@ -666,16 +666,16 @@ inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned const bst_float zero_fraction = unique_path[path_index].zero_fraction; bst_float next_one_portion = unique_path[unique_depth].pweight; - for (int i = unique_depth-1; i >= 0; --i) { + for (int i = unique_depth - 1; i >= 0; --i) { if (one_fraction != 0) { const bst_float tmp = unique_path[i].pweight; - unique_path[i].pweight = next_one_portion*(unique_depth+1) - / static_cast((i+1)*one_fraction); - next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth-i) - / static_cast(unique_depth+1); + unique_path[i].pweight = next_one_portion * (unique_depth + 1) + / static_cast((i + 1) * one_fraction); + next_one_portion = tmp - unique_path[i].pweight * zero_fraction * (unique_depth - i) + / static_cast(unique_depth + 1); } else { - unique_path[i].pweight = (unique_path[i].pweight*(unique_depth+1)) - / static_cast(zero_fraction*(unique_depth-i)); + unique_path[i].pweight = (unique_path[i].pweight * (unique_depth + 1)) + / static_cast(zero_fraction * (unique_depth - i)); } } @@ -694,16 +694,16 @@ inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_ const bst_float zero_fraction = unique_path[path_index].zero_fraction; bst_float next_one_portion = unique_path[unique_depth].pweight; bst_float total = 0; - for (int i = unique_depth-1; i >= 0; --i) { + for (int i = unique_depth - 1; i >= 0; --i) { if (one_fraction != 0) { - const bst_float tmp = next_one_portion*(unique_depth+1) - / static_cast((i+1)*one_fraction); + const bst_float tmp = next_one_portion * (unique_depth + 1) + / static_cast((i + 1) * one_fraction); total += tmp; - next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth-i) + next_one_portion = unique_path[i].pweight - tmp * zero_fraction * ((unique_depth - i) / static_cast(unique_depth+1)); } else { - total += (unique_path[i].pweight/zero_fraction)/((unique_depth-i) - / static_cast(unique_depth+1)); + total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i) + / static_cast(unique_depth + 1)); } } return total; @@ -718,7 +718,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, // extend the unique path PathElement *unique_path = parent_unique_path + unique_depth; - if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path); + if (unique_depth > 0) std::copy(parent_unique_path, + parent_unique_path + unique_depth, unique_path); ExtendPath(unique_path, unique_depth, parent_zero_fraction, parent_one_fraction, parent_feature_index); const unsigned split_index = node.split_index(); @@ -728,7 +729,7 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, for (unsigned i = 1; i <= unique_depth; ++i) { const bst_float w = UnwoundPathSum(unique_path, unique_depth, i); const PathElement &el = unique_path[i]; - phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value(); + phi[el.feature_index] += w * (el.one_fraction - el.zero_fraction) * node.leaf_value(); } // internal node @@ -742,10 +743,11 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, } else { hot_index = node.cright(); } - const 
unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft()); + const unsigned cold_index = (static_cast(hot_index) == node.cleft() ? + node.cright() : node.cleft()); const bst_float w = this->stat(node_index).sum_hess; - const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess/w; - const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess/w; + const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess / w; + const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess / w; bst_float incoming_zero_fraction = 1; bst_float incoming_one_fraction = 1; @@ -753,19 +755,19 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, // if so we undo that split so we can redo it for this node unsigned path_index = 0; for (; path_index <= unique_depth; ++path_index) { - if (unique_path[path_index].feature_index == split_index) break; + if (static_cast(unique_path[path_index].feature_index) == split_index) break; } - if (path_index != unique_depth+1) { + if (path_index != unique_depth + 1) { incoming_zero_fraction = unique_path[path_index].zero_fraction; incoming_one_fraction = unique_path[path_index].one_fraction; UnwindPath(unique_path, unique_depth, path_index); unique_depth -= 1; } - TreeShap(feat, phi, hot_index, unique_depth+1, unique_path, + TreeShap(feat, phi, hot_index, unique_depth + 1, unique_path, hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index); - TreeShap(feat, phi, cold_index, unique_depth+1, unique_path, + TreeShap(feat, phi, cold_index, unique_depth + 1, unique_path, cold_zero_fraction*incoming_zero_fraction, 0, split_index); } } @@ -773,21 +775,21 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id, bst_float *out_contribs) const { // find the expected value of the tree's predictions - bst_float base_value = 0.0; - bst_float total_cover = 0; + bst_float base_value = 0.0f; + bst_float total_cover = 0.0f; for (int i = 0; i < (*this).param.num_nodes; ++i) { const auto node = (*this)[i]; if (node.is_leaf()) { const auto cover = this->stat(i).sum_hess; - base_value += cover*node.leaf_value(); + base_value += cover * node.leaf_value(); total_cover += cover; } } out_contribs[feat.size()] += base_value / total_cover; // Preallocate space for the unique path data - const int maxd = this->MaxDepth(root_id)+1; - PathElement *unique_path_data = new PathElement[(maxd*(maxd+1))/2]; + const int maxd = this->MaxDepth(root_id) + 1; + PathElement *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2]; TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1); delete[] unique_path_data;
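For orientation, below is a minimal R usage sketch of the two user-facing additions in this patch: the new xgb.plot.shap() dependence plots and the render argument of xgb.plot.multi.trees(). It is adapted from the examples added above and is not part of the diff itself; the last step assumes the DiagrammeR package is installed.

library(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

# fit a small binary classifier (settings follow the examples in this patch)
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 3, eta = 0.1, nthread = 2, nrounds = 50,
               objective = "binary:logistic", verbose = 0)

# per-observation SHAP contributions: one column per feature plus a "BIAS" column
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)

# dependence plots for the 6 most important features, reusing the precomputed contributions
xgb.plot.shap(agaricus.test$data, shap_contrib = contr, model = bst, top_n = 6, n_col = 3)

# build the projected multi-tree graph without rendering it, then render (or export) it later
gr <- xgb.plot.multi.trees(model = bst, features_keep = 3, render = FALSE)
DiagrammeR::render_graph(gr)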