From 78c4188cec31425f708d238160ea3afb67a7250a Mon Sep 17 00:00:00 2001 From: Scott Lundberg Date: Thu, 12 Oct 2017 12:35:51 -0700 Subject: [PATCH] SHAP values for feature contributions (#2438) * SHAP values for feature contributions * Fix commenting error * New polynomial time SHAP value estimation algorithm * Update API to support SHAP values * Fix merge conflicts with updates in master * Correct submodule hashes * Fix variable sized stack allocation * Make lint happy * Add docs * Fix typo * Adjust tolerances * Remove unneeded def * Fixed cpp test setup * Updated R API and cleaned up * Fixed test typo --- R-package/R/xgb.Booster.R | 206 +++++++++++----------- R-package/tests/testthat/test_helpers.R | 37 ++-- include/xgboost/gbm.h | 3 +- include/xgboost/learner.h | 4 +- include/xgboost/predictor.h | 4 +- include/xgboost/tree_model.h | 180 ++++++++++++++++++- python-package/xgboost/core.py | 13 +- src/c_api/c_api.cc | 3 +- src/gbm/gblinear.cc | 2 +- src/gbm/gbtree.cc | 4 +- src/learner.cc | 4 +- src/predictor/cpu_predictor.cc | 22 ++- src/predictor/gpu_predictor.cu | 5 +- tests/cpp/predictor/test_cpu_predictor.cc | 9 +- tests/cpp/predictor/test_gpu_predictor.cu | 1 + tests/python/test_basic.py | 15 ++ 16 files changed, 369 insertions(+), 143 deletions(-) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 3bdc8a58b..21edbfa7f 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -39,7 +39,7 @@ xgb.handleToBooster <- function(handle, raw = NULL) { is.null.handle <- function(handle) { if (!identical(class(handle), "xgb.Booster.handle")) stop("argument type must be xgb.Booster.handle") - + if (is.null(handle) || .Call(XGCheckNullPtr_R, handle)) return(TRUE) return(FALSE) @@ -61,49 +61,49 @@ xgb.get.handle <- function(object) { #' Restore missing parts of an incomplete xgb.Booster object. #' -#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing +#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing #' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid) -#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid +#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid #' but it has a raw Booster memory dump). -#' +#' #' @param object object of class \code{xgb.Booster} -#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data +#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data #' when it doesn't already exist. -#' +#' #' @details -#' +#' #' While this method is primarily for internal use, it might be useful in some practical situations. -#' +#' #' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object, -#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods -#' should still work for such a model object since those methods would be using -#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the +#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods +#' should still work for such a model object since those methods would be using +#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the #' \code{xgb.Booster.complete} function explicitely once after loading a model as an R-object. 
#' That would prevent further repeated implicit reconstruction of an internal booster model. -#' -#' @return +#' +#' @return #' An object of \code{xgb.Booster} class. -#' +#' #' @examples -#' +#' #' data(agaricus.train, package='xgboost') -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' saveRDS(bst, "xgb.model.rds") -#' +#' #' bst1 <- readRDS("xgb.model.rds") #' # the handle is invalid: #' print(bst1$handle) -#' +#' #' bst1 <- xgb.Booster.complete(bst1) #' # now the handle points to a valid internal booster model: #' print(bst1$handle) -#' +#' #' @export xgb.Booster.complete <- function(object, saveraw = TRUE) { if (!inherits(object, "xgb.Booster")) stop("argument type must be xgb.Booster") - + if (is.null.handle(object$handle)) { object$handle <- xgb.Booster.handle(modelfile = object$raw) } else { @@ -114,88 +114,90 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { } #' Predict method for eXtreme Gradient Boosting model -#' +#' #' Predicted values based on either xgboost model or model handle object. -#' +#' #' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle} #' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}. #' @param missing Missing is only used when input is dense matrix. Pick a float value that represents #' missing values in data (e.g., sometimes 0 or some other extreme value is used). -#' @param outputmargin whether the prediction should be returned in the for of original untransformed -#' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for +#' @param outputmargin whether the prediction should be returned in the for of original untransformed +#' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for #' logistic regression would result in predictions for log-odds instead of probabilities. #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details). #' It will use all the trees by default (\code{NULL} value). #' @param predleaf whether predict leaf index instead. #' @param predcontrib whether to return feature contributions to individual predictions instead (see Details). -#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several +#' @param approxcontrib whether to use a fast approximation for feature contributions (see Details). +#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several #' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}. #' @param ... Parameters passed to \code{predict.xgb.Booster} -#' -#' @details +#' +#' @details #' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations #' and it is not necessarily equal to the number of trees in a model. #' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees. -#' But for multiclass classification, while there are multiple trees per iteration, +#' But for multiclass classification, while there are multiple trees per iteration, #' \code{ntreelimit} limits the number of boosting iterations. 
-#' -#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, +#' +#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear, #' since gblinear doesn't keep its boosting history. -#' -#' One possible practical applications of the \code{predleaf} option is to use the model -#' as a generator of new features which capture non-linearity and interactions, +#' +#' One possible practical applications of the \code{predleaf} option is to use the model +#' as a generator of new features which capture non-linearity and interactions, #' e.g., as implemented in \code{\link{xgb.create.features}}. -#' +#' #' Setting \code{predcontrib = TRUE} allows to calculate contributions of each feature to #' individual predictions. For "gblinear" booster, feature contributions are simply linear terms -#' (feature_beta * feature_value). For "gbtree" booster, feature contribution is calculated -#' as a sum of average contribution of that feature's split nodes across all trees to an -#' individual prediction, following the idea explained in -#' \url{http://blog.datadive.net/interpreting-random-forests/}. -#' -#' @return +#' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP +#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output +#' of the model and the current prediction (where the hessian weights are used to compute the expectations). +#' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained +#' in \url{http://blog.datadive.net/interpreting-random-forests/}. +#' +#' @return #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}. -#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or -#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on +#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or +#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on #' the \code{reshape} value. -#' -#' When \code{predleaf = TRUE}, the output is a matrix object with the +#' +#' When \code{predleaf = TRUE}, the output is a matrix object with the #' number of columns corresponding to the number of trees. -#' +#' #' When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with #' \code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias. #' For a multiclass case, a list of \code{num_class} elements is returned, where each element is -#' such a matrix. The contribution values are on the scale of untransformed margin +#' such a matrix. The contribution values are on the scale of untransformed margin #' (e.g., for binary classification would mean that the contributions are log-odds deviations from bias). -#' +#' #' @seealso #' \code{\link{xgb.train}}. 
-#' +#' #' @examples #' ## binary classification: -#' +#' #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' train <- agaricus.train #' test <- agaricus.test -#' -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, +#' +#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic") #' # use all trees by default #' pred <- predict(bst, test$data) #' # use only the 1st tree #' pred1 <- predict(bst, test$data, ntreelimit = 1) -#' +#' #' # Predicting tree leafs: #' # the result is an nsamples X ntrees matrix #' pred_leaf <- predict(bst, test$data, predleaf = TRUE) #' str(pred_leaf) -#' +#' #' # Predicting feature contributions to predictions: #' # the result is an nsamples X (nfeatures + 1) matrix #' pred_contr <- predict(bst, test$data, predcontrib = TRUE) #' str(pred_contr) -#' # verify that contributions' sums are equal to log-odds of predictions (up to foat precision): +#' # verify that contributions' sums are equal to log-odds of predictions (up to float precision): #' summary(rowSums(pred_contr) - qlogis(pred)) #' # for the 1st record, let's inspect its features that had non-zero contribution to prediction: #' contr1 <- pred_contr[1,] @@ -206,10 +208,10 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' par(mar = old_mar + c(0,7,0,0)) #' barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds") #' par(mar = old_mar) -#' -#' +#' +#' #' ## multiclass classification in iris dataset: -#' +#' #' lb <- as.numeric(iris$Species) - 1 #' num_class <- 3 #' set.seed(11) @@ -225,7 +227,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' pred_labels <- max.col(pred) - 1 #' # the following should result in the same error as seen in the last iteration #' sum(pred_labels != lb)/length(lb) -#' +#' #' # compare that to the predictions from softmax: #' set.seed(11) #' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, @@ -234,14 +236,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' pred <- predict(bst, as.matrix(iris[, -5])) #' str(pred) #' all.equal(pred, pred_labels) -#' # prediction from using only 5 iterations should result +#' # prediction from using only 5 iterations should result #' # in the same error as seen in iteration 5: #' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5) #' sum(pred5 != lb)/length(lb) -#' -#' +#' +#' #' ## random forest-like model of 25 trees for binary classification: -#' +#' #' set.seed(11) #' bst <- xgboost(data = train$data, label = train$label, max_depth = 5, #' nthread = 2, nrounds = 1, objective = "binary:logistic", @@ -258,7 +260,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' @rdname predict.xgb.Booster #' @export predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, - predleaf = FALSE, predcontrib = FALSE, reshape = FALSE, ...) { + predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) 
{ object <- xgb.Booster.complete(object, saveraw = FALSE) if (!inherits(newdata, "xgb.DMatrix")) @@ -269,18 +271,18 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA ntreelimit <- 0 if (ntreelimit < 0) stop("ntreelimit cannot be negative") - - option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) - + + option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib) + ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit)) - + n_ret <- length(ret) n_row <- nrow(newdata) npred_per_case <- n_ret / n_row - + if (n_ret %% n_row != 0) stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row) - + if (predleaf) { ret <- if (n_ret == n_row) { matrix(ret, ncol = 1) @@ -325,9 +327,9 @@ predict.xgb.Booster.handle <- function(object, ...) { #' #' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}. #' @param name a non-empty character string specifying which attribute is to be accessed. -#' @param value a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} -#' it's a list (or an object coercible to a list) with the names of attributes to set -#' and the elements corresponding to attribute values. +#' @param value a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-} +#' it's a list (or an object coercible to a list) with the names of attributes to set +#' and the elements corresponding to attribute values. #' Non-character values are converted to character. #' When attribute value is not a scalar, only the first index is used. #' Use \code{NULL} to remove an attribute. @@ -336,32 +338,32 @@ predict.xgb.Booster.handle <- function(object, ...) { #' The primary purpose of xgboost model attributes is to store some meta-data about the model. #' Note that they are a separate concept from the object attributes in R. #' Specifically, they refer to key-value strings that can be attached to an xgboost model, -#' stored together with the model's binary representation, and accessed later +#' stored together with the model's binary representation, and accessed later #' (from R or any other interface). #' In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class #' would not be saved by \code{xgb.save} because an xgboost model is an external memory object #' and its serialization is handled externally. -#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't -#' change the value of that parameter for a model. +#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't +#' change the value of that parameter for a model. #' Use \code{\link{xgb.parameters<-}} to set or change model parameters. -#' +#' #' The attribute setters would usually work more efficiently for \code{xgb.Booster.handle} #' than for \code{xgb.Booster}, since only just a handle (pointer) would need to be copied. #' That would only matter if attributes need to be set many times. #' Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters, -#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated, +#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated, #' and it would be user's responsibility to call \code{xgb.save.raw} to update it. 
-#' -#' The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, +#' +#' The \code{xgb.attributes<-} setter either updates the existing or adds one or several attributes, #' but it doesn't delete the other existing attributes. -#' +#' #' @return -#' \code{xgb.attr} returns either a string value of an attribute +#' \code{xgb.attr} returns either a string value of an attribute #' or \code{NULL} if an attribute wasn't stored in a model. -#' -#' \code{xgb.attributes} returns a list of all attribute stored in a model +#' +#' \code{xgb.attributes} returns a list of all attribute stored in a model #' or \code{NULL} if a model has no stored attributes. -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train @@ -377,13 +379,13 @@ predict.xgb.Booster.handle <- function(object, ...) { #' bst1 <- xgb.load('xgb.model') #' print(xgb.attr(bst1, "my_attribute")) #' print(xgb.attributes(bst1)) -#' +#' #' # deletion: #' xgb.attr(bst1, "my_attribute") <- NULL #' print(xgb.attributes(bst1)) #' xgb.attributes(bst1) <- list(a = NULL, b = NULL) #' print(xgb.attributes(bst1)) -#' +#' #' @rdname xgb.attr #' @export xgb.attr <- function(object, name) { @@ -464,7 +466,7 @@ xgb.attributes <- function(object) { #' @details #' Note that the setter would usually work more efficiently for \code{xgb.Booster.handle} #' than for \code{xgb.Booster}, since only just a handle would need to be copied. -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train @@ -473,7 +475,7 @@ xgb.attributes <- function(object) { #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' #' xgb.parameters(bst) <- list(eta = 0.1) -#' +#' #' @rdname xgb.parameters #' @export `xgb.parameters<-` <- function(object, value) { @@ -503,28 +505,28 @@ xgb.ntree <- function(bst) { #' Print xgb.Booster -#' +#' #' Print information about xgb.Booster. -#' +#' #' @param x an xgb.Booster object #' @param verbose whether to print detailed data (e.g., attribute values) #' @param ... not currently used -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' attr(bst, 'myattr') <- 'memo' -#' +#' #' print(bst) #' print(bst, verbose=TRUE) #' -#' @method print xgb.Booster +#' @method print xgb.Booster #' @export print.xgb.Booster <- function(x, verbose = FALSE, ...) { cat('##### xgb.Booster\n') - + valid_handle <- is.null.handle(x$handle) if (!valid_handle) cat("Handle is invalid! Suggest using xgb.Booster.complete\n") @@ -539,10 +541,10 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) { cat('call:\n ') print(x$call) } - + if (!is.null(x$params)) { cat('params (as set within xgb.train):\n') - cat( ' ', + cat( ' ', paste(names(x$params), paste0('"', unlist(x$params), '"'), sep = ' = ', collapse = ', '), '\n', sep = '') @@ -562,7 +564,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) { cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '') } } - + if (!is.null(x$callbacks) && length(x$callbacks) > 0) { cat('callbacks:\n') lapply(callback.calls(x$callbacks), function(x) { @@ -570,14 +572,14 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) 
{ print(x) }) } - + if (!is.null(x$feature_names)) cat('# of features:', length(x$feature_names), '\n') - + cat('niter: ', x$niter, '\n', sep = '') # TODO: uncomment when faster xgb.ntree is implemented #cat('ntree: ', xgb.ntree(x), '\n', sep='') - + for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', 'evaluation_log','niter','feature_names'))) { if (is.atomic(x[[n]])) { @@ -587,11 +589,11 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) { print(x[[n]]) } } - + if (!is.null(x$evaluation_log)) { cat('evaluation_log:\n') print(x$evaluation_log, row.names = FALSE, topn = 2) } - + invisible(x) } diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 53f5deab0..95e57929d 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -52,9 +52,9 @@ test_that("xgb.dump works", { test_that("xgb.dump works for gblinear", { expect_length(xgb.dump(bst.GLM), 14) - # also make sure that it works properly for a sparse model where some coefficients + # also make sure that it works properly for a sparse model where some coefficients # are 0 from setting large L1 regularization: - bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1, + bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1, alpha=2, objective = "binary:logistic", booster = "gblinear") d.sp <- xgb.dump(bst.GLM.sp) expect_length(d.sp, 14) @@ -80,28 +80,35 @@ test_that("predict feature contributions works", { expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1)) expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS")) pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE) - expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-6) - + expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) + + # gbtree binary classifier (approximate method) + expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA) + expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1)) + expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS")) + pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE) + expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) + # gblinear binary classifier expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA) expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1)) expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS")) pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE) - expect_lt(max(abs(rowSums(pred_contr) - pred)), 2e-6) + expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) # manual calculation of linear terms coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric coefs <- c(coefs[-1], coefs[1]) # intercept must be the last pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*") - expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6) + expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 1e-5) - # gbtree multiclass + # gbtree multiclass pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE) pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE) expect_is(pred_contr, "list") expect_length(pred_contr, 3) for (g in seq_along(pred_contr)) { expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS")) - 
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6) + expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5) } # gblinear multiclass (set base_score = 0, which is base margin in multiclass) @@ -192,10 +199,10 @@ test_that("xgb.model.dt.tree works with and without feature names", { expect_equal(names.dt.trees, names(dt.tree)) expect_equal(dim(dt.tree), c(188, 10)) expect_output(str(dt.tree), 'Feature.*\\"Age\\"') - + dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree) expect_equal(dt.tree, dt.tree.0) - + # when model contains no feature names: bst.Tree.x <- bst.Tree bst.Tree.x$feature_names <- NULL @@ -219,20 +226,20 @@ test_that("xgb.importance works with and without feature names", { expect_equal(dim(importance.Tree), c(7, 4)) expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency")) expect_output(str(importance.Tree), 'Feature.*\\"Age\\"') - + importance.Tree.0 <- xgb.importance(model = bst.Tree) expect_equal(importance.Tree, importance.Tree.0) - + # when model contains no feature names: bst.Tree.x <- bst.Tree bst.Tree.x$feature_names <- NULL importance.Tree.x <- xgb.importance(model = bst.Tree) expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE]) - + imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree) expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance")) xgb.ggplot.importance(importance_matrix = importance.Tree) - + # for multiclass imp.Tree <- xgb.importance(model = mbst.Tree) expect_equal(dim(imp.Tree), c(4, 4)) @@ -247,7 +254,7 @@ test_that("xgb.importance works with GLM model", { imp2plot <- xgb.plot.importance(importance.GLM) expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance")) xgb.ggplot.importance(importance.GLM) - + # for multiclass imp.GLM <- xgb.importance(model = mbst.GLM) expect_equal(dim(imp.GLM), c(12, 3)) diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index d5e86677c..877890509 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -115,10 +115,11 @@ class GradientBooster { * \param out_contribs output vector to hold the contributions * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means * we do not limit number of trees + * \param approximate use a faster (inconsistent) approximation of SHAP values */ virtual void PredictContribution(DMatrix* dmat, std::vector* out_contribs, - unsigned ntree_limit = 0) = 0; + unsigned ntree_limit = 0, bool approximate = false) = 0; /*! * \brief dump the model in the requested format diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index bcd200bf3..9b4149355 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -104,13 +104,15 @@ class Learner : public rabit::Serializable { * predictor, when it equals 0, this means we are using all the trees * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor * \param pred_contribs whether to only predict the feature contributions + * \param approx_contribs whether to approximate the feature contributions for speed */ virtual void Predict(DMatrix* data, bool output_margin, std::vector *out_preds, unsigned ntree_limit = 0, bool pred_leaf = false, - bool pred_contribs = false) const = 0; + bool pred_contribs = false, + bool approx_contribs = false) const = 0; /*! * \brief Set additional attribute to the Booster. * The property will be saved along the booster. 
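
Note (editor, not part of the patch): downstream of the Learner::Predict declaration above, the same flags are threaded through as bits of the prediction option mask — 0x1 for output_margin, 0x2 for pred_leaf, 0x4 for pred_contribs, and the new 0x8 for approx_contribs — as the R, Python and c_api.cc hunks elsewhere in this patch show. A minimal sketch of calling the extended C++ API directly follows; GetContributions is a hypothetical helper used only for illustration, and the layout comment reflects how the predictor fills the buffer later in this patch.

#include <xgboost/learner.h>
#include <xgboost/data.h>
#include <vector>

// Sketch only: request SHAP contributions through the extended Predict signature.
// `learner` and `dmat` stand in for whatever trained booster and data the caller has.
std::vector<xgboost::bst_float> GetContributions(xgboost::Learner* learner,
                                                 xgboost::DMatrix* dmat,
                                                 bool approximate) {
  std::vector<xgboost::bst_float> contribs;
  learner->Predict(dmat, /*output_margin=*/false, &contribs,
                   /*ntree_limit=*/0, /*pred_leaf=*/false,
                   /*pred_contribs=*/true, /*approx_contribs=*/approximate);
  // Layout: nrow * (num_feature + 1) values per output group; the final column
  // holds the bias, i.e. the model's expected margin output.
  return contribs;
}
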
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 8f0104f5a..cc89bb60a 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -144,12 +144,14 @@ class Predictor { * \param [in,out] out_contribs The output feature contribs. * \param model Model to make predictions from. * \param ntree_limit (Optional) The ntree limit. + * \param approximate Use fast approximate algorithm. */ virtual void PredictContribution(DMatrix* dmat, std::vector* out_contribs, const gbm::GBTreeModel& model, - unsigned ntree_limit = 0) = 0; + unsigned ntree_limit = 0, + bool approximate = false) = 0; /** * \fn static Predictor* Predictor::Create(std::string name); diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 3cf33780d..fb8b16ec1 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "./base.h" #include "./data.h" #include "./logging.h" @@ -411,6 +412,20 @@ struct RTreeNodeStat { int leaf_child_cnt; }; +// Used by TreeShap +// data we keep about our decision path +// note that pweight is included for convenience and is not tied with the other attributes +// the pweight of the i'th path element is the permuation weight of paths with i-1 ones in them +struct PathElement { + int feature_index; + bst_float zero_fraction; + bst_float one_fraction; + bst_float pweight; + PathElement() {} + PathElement(int i, bst_float z, bst_float o, bst_float w) : + feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {} +}; + /*! * \brief define regression tree to be the most common tree model. * This is the data structure used in xgboost's major tree models. @@ -482,13 +497,26 @@ class RegTree: public TreeModel { */ inline bst_float Predict(const FVec& feat, unsigned root_id = 0) const; /*! - * \brief calculate the feature contributions for the given root + * \brief calculate the feature contributions (https://arxiv.org/abs/1706.06060) for the tree * \param feat dense feature vector, if the feature is missing the field is set to NaN * \param root_id starting root index of the instance * \param out_contribs output vector to hold the contributions */ inline void CalculateContributions(const RegTree::FVec& feat, unsigned root_id, bst_float *out_contribs) const; + inline void TreeShap(const RegTree::FVec& feat, bst_float *phi, + unsigned node_index, unsigned unique_depth, + PathElement *parent_unique_path, bst_float parent_zero_fraction, + bst_float parent_one_fraction, int parent_feature_index) const; + + /*! + * \brief calculate the approximate feature contributions for the given root + * \param feat dense feature vector, if the feature is missing the field is set to NaN + * \param root_id starting root index of the instance + * \param out_contribs output vector to hold the contributions + */ + inline void CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id, + bst_float *out_contribs) const; /*! * \brief get next position of the tree given current pid * \param pid Current node id. 
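
Note (editor, not part of the patch): the PathElement struct and the TreeShap/CalculateContributions declarations above are implemented in the hunks that follow. ExtendPath pushes a split onto the current decision path and rescales the permutation weights (pweight), UnwindPath pops one back off, and TreeShap recurses over the tree accumulating each feature's contribution. As a rough illustration of that bookkeeping — assuming the patched xgboost/tree_model.h and its dependencies are on the include path and that these helpers sit in the xgboost namespace like the file's other definitions — extending a path and then unwinding that extension must restore the previous weights, which is what this small check exercises:

#include <xgboost/tree_model.h>
#include <iostream>

int main() {
  xgboost::PathElement path[3];
  // the empty path: pweight 1, dummy feature index -1
  xgboost::ExtendPath(path, 0, 1.0f, 1.0f, -1);
  // add a split whose branch carries 60% of the training weight (zero_fraction = 0.6)
  // and which the instance being explained follows (one_fraction = 1)
  xgboost::ExtendPath(path, 1, /*zero_fraction=*/0.6f, /*one_fraction=*/1.0f,
                      /*feature_index=*/0);
  // the permutation weight is now split across the two path elements
  std::cout << path[0].pweight << " " << path[1].pweight << std::endl;  // 0.3 0.5
  // undo the extension: the empty-path weight must come back to 1
  xgboost::UnwindPath(path, 1, 1);
  std::cout << path[0].pweight << std::endl;  // 1
  return 0;
}
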
@@ -590,7 +618,7 @@ inline bst_float RegTree::FillNodeMeanValue(int nid) { return result; } -inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id, +inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id, bst_float *out_contribs) const { CHECK_GT(this->node_mean_values.size(), 0U); // this follows the idea of http://blog.datadive.net/interpreting-random-forests/ @@ -617,6 +645,154 @@ inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned out_contribs[split_index] += leaf_value - node_value; } +// extend our decision path with a fraction of one and zero extensions +inline void ExtendPath(PathElement *unique_path, unsigned unique_depth, + bst_float zero_fraction, bst_float one_fraction, int feature_index) { + unique_path[unique_depth].feature_index = feature_index; + unique_path[unique_depth].zero_fraction = zero_fraction; + unique_path[unique_depth].one_fraction = one_fraction; + unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0); + for (int i = unique_depth-1; i >= 0; i--) { + unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1) + / static_cast(unique_depth+1); + unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth-i) + / static_cast(unique_depth+1); + } +} + +// undo a previous extension of the decision path +inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned path_index) { + const bst_float one_fraction = unique_path[path_index].one_fraction; + const bst_float zero_fraction = unique_path[path_index].zero_fraction; + bst_float next_one_portion = unique_path[unique_depth].pweight; + + for (int i = unique_depth-1; i >= 0; --i) { + if (one_fraction != 0) { + const bst_float tmp = unique_path[i].pweight; + unique_path[i].pweight = next_one_portion*(unique_depth+1) + / static_cast((i+1)*one_fraction); + next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth-i) + / static_cast(unique_depth+1); + } else { + unique_path[i].pweight = (unique_path[i].pweight*(unique_depth+1)) + / static_cast(zero_fraction*(unique_depth-i)); + } + } + + for (int i = path_index; i < unique_depth; ++i) { + unique_path[i].feature_index = unique_path[i+1].feature_index; + unique_path[i].zero_fraction = unique_path[i+1].zero_fraction; + unique_path[i].one_fraction = unique_path[i+1].one_fraction; + } +} + +// determine what the total permuation weight would be if +// we unwound a previous extension in the decision path +inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_depth, + unsigned path_index) { + const bst_float one_fraction = unique_path[path_index].one_fraction; + const bst_float zero_fraction = unique_path[path_index].zero_fraction; + bst_float next_one_portion = unique_path[unique_depth].pweight; + bst_float total = 0; + for (int i = unique_depth-1; i >= 0; --i) { + if (one_fraction != 0) { + const bst_float tmp = next_one_portion*(unique_depth+1) + / static_cast((i+1)*one_fraction); + total += tmp; + next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth-i) + / static_cast(unique_depth+1)); + } else { + total += (unique_path[i].pweight/zero_fraction)/((unique_depth-i) + / static_cast(unique_depth+1)); + } + } + return total; +} + +// recursive computation of SHAP values for a decision tree +inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, + unsigned node_index, unsigned unique_depth, + PathElement *parent_unique_path, bst_float 
parent_zero_fraction, + bst_float parent_one_fraction, int parent_feature_index) const { + const auto node = (*this)[node_index]; + + // extend the unique path + PathElement *unique_path = parent_unique_path + unique_depth; + if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path); + ExtendPath(unique_path, unique_depth, parent_zero_fraction, + parent_one_fraction, parent_feature_index); + const unsigned split_index = node.split_index(); + + // leaf node + if (node.is_leaf()) { + for (int i = 1; i <= unique_depth; ++i) { + const bst_float w = UnwoundPathSum(unique_path, unique_depth, i); + const PathElement &el = unique_path[i]; + phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value(); + } + + // internal node + } else { + // find which branch is "hot" (meaning x would follow it) + unsigned hot_index = 0; + if (feat.is_missing(split_index)) { + hot_index = node.cdefault(); + } else if (feat.fvalue(split_index) < node.split_cond()) { + hot_index = node.cleft(); + } else { + hot_index = node.cright(); + } + const unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft()); + const bst_float w = this->stat(node_index).sum_hess; + const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess/w; + const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess/w; + bst_float incoming_zero_fraction = 1; + bst_float incoming_one_fraction = 1; + + // see if we have already split on this feature, + // if so we undo that split so we can redo it for this node + unsigned path_index = 0; + for (; path_index <= unique_depth; ++path_index) { + if (unique_path[path_index].feature_index == split_index) break; + } + if (path_index != unique_depth+1) { + incoming_zero_fraction = unique_path[path_index].zero_fraction; + incoming_one_fraction = unique_path[path_index].one_fraction; + UnwindPath(unique_path, unique_depth, path_index); + unique_depth -= 1; + } + + TreeShap(feat, phi, hot_index, unique_depth+1, unique_path, + hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index); + + TreeShap(feat, phi, cold_index, unique_depth+1, unique_path, + cold_zero_fraction*incoming_zero_fraction, 0, split_index); + } +} + +inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id, + bst_float *out_contribs) const { + // find the expected value of the tree's predictions + bst_float base_value = 0.0; + bst_float total_cover = 0; + for (unsigned i = 0; i < (*this).param.num_nodes; ++i) { + const auto node = (*this)[i]; + if (node.is_leaf()) { + const auto cover = this->stat(i).sum_hess; + base_value += cover*node.leaf_value(); + total_cover += cover; + } + } + out_contribs[feat.size()] += base_value / total_cover; + + // Preallocate space for the unique path data + const int maxd = this->MaxDepth(root_id)+1; + PathElement *unique_path_data = new PathElement[(maxd*(maxd+1))/2]; + + TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1); + delete[] unique_path_data; +} + /*! 
\brief get next position of the tree given current pid */ inline int RegTree::GetNext(int pid, bst_float fvalue, bool is_unknown) const { bst_float split_value = (*this)[pid].split_cond(); diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 802e93a60..ca544f6d7 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -990,7 +990,7 @@ class Booster(object): return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False, - pred_contribs=False): + pred_contribs=False, approx_contribs=False): """ Predict with data. @@ -1018,9 +1018,12 @@ class Booster(object): pred_contribs : bool When this option is on, the output will be a matrix of (nsample, nfeats+1) - with each record indicating the feature contributions of all trees. The sum of - all feature contributions is equal to the prediction. Note that the bias is added - as the final column, on top of the regular features. + with each record indicating the feature contributions (SHAP values) for that + prediction. The sum of all feature contributions is equal to the prediction. + Note that the bias is added as the final column, on top of the regular features. + + approx_contribs : bool + Approximate the contributions of each feature Returns ------- @@ -1033,6 +1036,8 @@ class Booster(object): option_mask |= 0x02 if pred_contribs: option_mask |= 0x04 + if approx_contribs: + option_mask |= 0x08 self._validate_features(data) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index e626c2fa4..7756628d0 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -758,7 +758,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, (option_mask & 1) != 0, &preds, ntree_limit, (option_mask & 2) != 0, - (option_mask & 4) != 0); + (option_mask & 4) != 0, + (option_mask & 8) != 0); *out_result = dmlc::BeginPtr(preds); *len = static_cast(preds.size()); API_END(); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index d839532ad..4bda25f92 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -224,7 +224,7 @@ class GBLinear : public GradientBooster { void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - unsigned ntree_limit) override { + unsigned ntree_limit, bool approximate) override { if (model.weight.size() == 0) { model.InitModel(); } diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 2235b79e6..990327e5f 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -233,8 +233,8 @@ class GBTree : public GradientBooster { void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - unsigned ntree_limit) override { - predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit); + unsigned ntree_limit, bool approximate) override { + predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit, approximate); } std::vector DumpModel(const FeatureMap& fmap, diff --git a/src/learner.cc b/src/learner.cc index 7555fadc6..7a9a6cc2d 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -433,9 +433,9 @@ class LearnerImpl : public Learner { void Predict(DMatrix* data, bool output_margin, std::vector* out_preds, unsigned ntree_limit, - bool pred_leaf, bool pred_contribs) const override { + bool pred_leaf, bool pred_contribs, bool approx_contribs) const override { if (pred_contribs) { - gbm_->PredictContribution(data, out_preds, ntree_limit); + gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs); } else if (pred_leaf) { gbm_->PredictLeaf(data, out_preds, 
ntree_limit); } else { diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 0b5190bd6..de0a85c49 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -206,9 +206,9 @@ class CPUPredictor : public Predictor { } } - void PredictContribution(DMatrix* p_fmat, - std::vector* out_contribs, - const gbm::GBTreeModel& model, unsigned ntree_limit) override { + void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, + const gbm::GBTreeModel& model, unsigned ntree_limit, + bool approximate) override { const int nthread = omp_get_max_threads(); InitThreadTemp(nthread, model.param.num_feature); const MetaInfo& info = p_fmat->info(); @@ -225,10 +225,12 @@ class CPUPredictor : public Predictor { // make sure contributions is zeroed, we could be reusing a previously // allocated one std::fill(contribs.begin(), contribs.end(), 0); -// initialize tree node mean values -#pragma omp parallel for schedule(static) - for (bst_omp_uint i = 0; i < ntree_limit; ++i) { - model.trees[i]->FillNodeMeanValues(); + if (approximate) { + // initialize tree node mean values + #pragma omp parallel for schedule(static) + for (bst_omp_uint i = 0; i < ntree_limit; ++i) { + model.trees[i]->FillNodeMeanValues(); + } } // start collecting the contributions dmlc::DataIter* iter = p_fmat->RowIterator(); @@ -253,7 +255,11 @@ class CPUPredictor : public Predictor { if (model.tree_info[j] != gid) { continue; } - model.trees[j]->CalculateContributions(feats, root_id, p_contribs); + if (!approximate) { + model.trees[j]->CalculateContributions(feats, root_id, p_contribs); + } else { + model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs); + } } feats.Drop(batch[i]); // add base margin to BIAS diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 8bf921d5c..be63bd385 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -384,9 +384,10 @@ class GPUPredictor : public xgboost::Predictor { void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, const gbm::GBTreeModel& model, - unsigned ntree_limit) override { + unsigned ntree_limit, + bool approximate) override { cpu_predictor->PredictContribution(p_fmat, out_contribs, model, - ntree_limit); + ntree_limit, approximate); } void Init(const std::vector>& cfg, diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 011e2c392..025aa11e5 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -12,6 +12,7 @@ TEST(cpu_predictor, Test) { trees.push_back(std::unique_ptr(new RegTree)); trees.back()->InitModel(); (*trees.back())[0].set_leaf(1.5f); + (*trees.back()).stat(0).sum_hess = 1.0f; gbm::GBTreeModel model(0.5); model.CommitModel(std::move(trees), 0); model.param.num_output_group = 1; @@ -50,5 +51,11 @@ TEST(cpu_predictor, Test) { for (int i = 0; i < out_contribution.size(); i++) { ASSERT_EQ(out_contribution[i], 1.5); } + + // Test predict contribution (approximate method) + cpu_predictor->PredictContribution(dmat.get(), &out_contribution, model, true); + for (int i = 0; i < out_contribution.size(); i++) { + ASSERT_EQ(out_contribution[i], 1.5); + } } -} // namespace xgboost \ No newline at end of file +} // namespace xgboost diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 4a7399460..712fad986 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ 
b/tests/cpp/predictor/test_gpu_predictor.cu @@ -19,6 +19,7 @@ TEST(gpu_predictor, Test) { trees.push_back(std::unique_ptr()); trees.back()->InitModel(); (*trees.back())[0].set_leaf(1.5f); + (*trees.back()).stat(0).sum_hess = 1.0f; gbm::GBTreeModel model(0.5); model.CommitModel(std::move(trees), 0); model.param.num_output_group = 1; diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 5d5eccaab..60209f263 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -291,3 +291,18 @@ def test_contributions(): for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)): yield test_fn, max_depth, num_rounds + + # check that we get the right SHAP values for a basic AND example + # (https://arxiv.org/abs/1706.06060) + X = np.zeros((4, 2)) + X[0, :] = 1 + X[1, 0] = 1 + X[2, 1] = 1 + y = np.zeros(4) + y[0] = 1 + param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} + bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) + out = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) + assert out[0, 0] == 0.375 + assert out[0, 1] == 0.375 + assert out[0, 2] == 0.25
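
Note (editor, not part of the patch): the expected values in this AND test can be worked out by hand. With base_score = 0, eta = 1, lambda = 0 and the default squared-error objective, the single depth-2 tree predicts 1.0 for x = (1, 1) and 0.0 for the other three rows, and the leaf predicting 1.0 covers one of the four training rows, so the cover-weighted expected output (the BIAS column) is 1/4 = 0.25. Conditioning on either feature alone gives an expectation of 0.5, so each feature receives ((0.5 - 0.25) + (1.0 - 0.5)) / 2 = 0.375, and 0.375 + 0.375 + 0.25 sums back to the prediction of 1.0 — which is exactly what the three assertions check.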