SHAP values for feature contributions (#2438)

* SHAP values for feature contributions

* Fix commenting error

* New polynomial time SHAP value estimation algorithm

* Update API to support SHAP values

* Fix merge conflicts with updates in master

* Correct submodule hashes

* Fix variable sized stack allocation

* Make lint happy

* Add docs

* Fix typo

* Adjust tolerances

* Remove unneeded def

* Fixed cpp test setup

* Updated R API and cleaned up

* Fixed test typo
Scott Lundberg, 2017-10-12 12:35:51 -07:00, committed by GitHub
parent ff9180cd73
commit 78c4188cec
16 changed files with 369 additions and 143 deletions


@@ -39,7 +39,7 @@ xgb.handleToBooster <- function(handle, raw = NULL) {
is.null.handle <- function(handle) {
if (!identical(class(handle), "xgb.Booster.handle"))
stop("argument type must be xgb.Booster.handle")
if (is.null(handle) || .Call(XGCheckNullPtr_R, handle))
return(TRUE)
return(FALSE)
@@ -61,49 +61,49 @@ xgb.get.handle <- function(object) {
#' Restore missing parts of an incomplete xgb.Booster object.
#'
#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing
#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
#' but it has a raw Booster memory dump).
#'
#' @param object object of class \code{xgb.Booster}
#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data
#' when it doesn't already exist.
#'
#' @details
#'
#' While this method is primarily for internal use, it might be useful in some practical situations.
#'
#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
#' should still work for such a model object since those methods would be using
#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
#' \code{xgb.Booster.complete} function explicitly once after loading a model as an R-object.
#' That would prevent further repeated implicit reconstruction of an internal booster model.
#'
#' @return
#' An object of \code{xgb.Booster} class.
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' saveRDS(bst, "xgb.model.rds")
#'
#' bst1 <- readRDS("xgb.model.rds")
#' # the handle is invalid:
#' print(bst1$handle)
#'
#' bst1 <- xgb.Booster.complete(bst1)
#' # now the handle points to a valid internal booster model:
#' print(bst1$handle)
#'
#' @export
xgb.Booster.complete <- function(object, saveraw = TRUE) {
if (!inherits(object, "xgb.Booster"))
stop("argument type must be xgb.Booster")
if (is.null.handle(object$handle)) {
object$handle <- xgb.Booster.handle(modelfile = object$raw)
} else {
@@ -114,88 +114,90 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
}
#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on either xgboost model or model handle object.
#'
#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix. Pick a float value that represents
#' missing values in data (e.g., sometimes 0 or some other extreme value is used).
#' @param outputmargin whether the prediction should be returned in the form of the original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
#' logistic regression would result in predictions for log-odds instead of probabilities.
#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
#' It will use all the trees by default (\code{NULL} value).
#' @param predleaf whether to predict leaf indices instead.
#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details).
#' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
#' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.
#' @param ... Parameters passed to \code{predict.xgb.Booster}
#'
#' @details
#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
#' and it is not necessarily equal to the number of trees in a model.
#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
#' But for multiclass classification, while there are multiple trees per iteration,
#' \code{ntreelimit} limits the number of boosting iterations.
#'
#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
#' since gblinear doesn't keep its boosting history.
#'
#' One possible practical application of the \code{predleaf} option is to use the model
#' as a generator of new features which capture non-linearity and interactions,
#' e.g., as implemented in \code{\link{xgb.create.features}}.
#'
#' Setting \code{predcontrib = TRUE} allows calculating the contributions of each feature to
#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
#' (feature_beta * feature_value). For "gbtree" booster, feature contributions are SHAP
#' values (https://arxiv.org/abs/1706.06060) that sum to the difference between the expected output
#' of the model and the current prediction (where the hessian weights are used to compute the expectations).
#' Setting \code{approxcontrib = TRUE} approximates these values following the idea explained
#' in \url{http://blog.datadive.net/interpreting-random-forests/}.
#'
#' @return
#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
#' the \code{reshape} value.
#'
#' When \code{predleaf = TRUE}, the output is a matrix object with the
#' number of columns corresponding to the number of trees.
#'
#' When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
#' \code{num_features + 1} columns. The last "+ 1" column in a matrix corresponds to bias.
#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
#' such a matrix. The contribution values are on the scale of untransformed margin
#' (e.g., for binary classification, this means the contributions are log-odds deviations from the bias).
#'
#' @seealso
#' \code{\link{xgb.train}}.
#'
#' @examples
#' ## binary classification:
#'
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
#' # use all trees by default
#' pred <- predict(bst, test$data)
#' # use only the 1st tree
#' pred1 <- predict(bst, test$data, ntreelimit = 1)
#'
#' # Predicting tree leaves:
#' # the result is an nsamples X ntrees matrix
#' pred_leaf <- predict(bst, test$data, predleaf = TRUE)
#' str(pred_leaf)
#'
#' # Predicting feature contributions to predictions:
#' # the result is an nsamples X (nfeatures + 1) matrix
#' pred_contr <- predict(bst, test$data, predcontrib = TRUE)
#' str(pred_contr)
#' # verify that contributions' sums are equal to log-odds of predictions (up to float precision):
#' summary(rowSums(pred_contr) - qlogis(pred))
#' # for the 1st record, let's inspect its features that had non-zero contribution to prediction:
#' contr1 <- pred_contr[1,]
@@ -206,10 +208,10 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' par(mar = old_mar + c(0,7,0,0))
#' barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
#' par(mar = old_mar)
#'
#'
#' ## multiclass classification in iris dataset:
#'
#' lb <- as.numeric(iris$Species) - 1
#' num_class <- 3
#' set.seed(11)
@@ -225,7 +227,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' pred_labels <- max.col(pred) - 1
#' # the following should result in the same error as seen in the last iteration
#' sum(pred_labels != lb)/length(lb)
#'
#' # compare that to the predictions from softmax:
#' set.seed(11)
#' bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
@@ -234,14 +236,14 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' pred <- predict(bst, as.matrix(iris[, -5]))
#' str(pred)
#' all.equal(pred, pred_labels)
#' # prediction from using only 5 iterations should result
#' # in the same error as seen in iteration 5:
#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
#' sum(pred5 != lb)/length(lb)
#'
#'
#' ## random forest-like model of 25 trees for binary classification:
#'
#' set.seed(11)
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
#' nthread = 2, nrounds = 1, objective = "binary:logistic",
@@ -258,7 +260,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, reshape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE)
if (!inherits(newdata, "xgb.DMatrix"))
@@ -269,18 +271,18 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
ntreelimit <- 0
if (ntreelimit < 0)
stop("ntreelimit cannot be negative")
option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) + 8L * as.logical(approxcontrib)
ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
n_ret <- length(ret)
n_row <- nrow(newdata)
npred_per_case <- n_ret / n_row
if (n_ret %% n_row != 0)
stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
if (predleaf) {
ret <- if (n_ret == n_row) {
matrix(ret, ncol = 1)
@@ -325,9 +327,9 @@ predict.xgb.Booster.handle <- function(object, ...) {
#'
#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}.
#' @param name a non-empty character string specifying which attribute is to be accessed.
#' @param value a value of an attribute for \code{xgb.attr<-}; for \code{xgb.attributes<-}
#' it's a list (or an object coercible to a list) with the names of attributes to set
#' and the elements corresponding to attribute values.
#' Non-character values are converted to character.
#' When attribute value is not a scalar, only the first index is used.
#' Use \code{NULL} to remove an attribute.
@@ -336,32 +338,32 @@ predict.xgb.Booster.handle <- function(object, ...) {
#' The primary purpose of xgboost model attributes is to store some meta-data about the model.
#' Note that they are a separate concept from the object attributes in R.
#' Specifically, they refer to key-value strings that can be attached to an xgboost model,
#' stored together with the model's binary representation, and accessed later
#' (from R or any other interface).
#' In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
#' would not be saved by \code{xgb.save} because an xgboost model is an external memory object
#' and its serialization is handled externally.
#' Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
#' change the value of that parameter for a model.
#' Use \code{\link{xgb.parameters<-}} to set or change model parameters.
#'
#' The attribute setters would usually work more efficiently for \code{xgb.Booster.handle}
#' than for \code{xgb.Booster}, since only a handle (pointer) would need to be copied.
#' That would only matter if attributes need to be set many times.
#' Note, however, that when feeding a handle of an \code{xgb.Booster} object to the attribute setters,
#' the raw model cache of an \code{xgb.Booster} object would not be automatically updated,
#' and it would be the user's responsibility to call \code{xgb.save.raw} to update it.
#'
#' The \code{xgb.attributes<-} setter either updates existing attributes or adds one or several new ones,
#' but it doesn't delete the other existing attributes.
#'
#' @return
#' \code{xgb.attr} returns either a string value of an attribute
#' or \code{NULL} if an attribute wasn't stored in a model.
#'
#' \code{xgb.attributes} returns a list of all attributes stored in a model
#' or \code{NULL} if a model has no stored attributes.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
@@ -377,13 +379,13 @@ predict.xgb.Booster.handle <- function(object, ...) {
#' bst1 <- xgb.load('xgb.model')
#' print(xgb.attr(bst1, "my_attribute"))
#' print(xgb.attributes(bst1))
#'
#' # deletion:
#' xgb.attr(bst1, "my_attribute") <- NULL
#' print(xgb.attributes(bst1))
#' xgb.attributes(bst1) <- list(a = NULL, b = NULL)
#' print(xgb.attributes(bst1))
#'
#' @rdname xgb.attr
#' @export
xgb.attr <- function(object, name) {
@@ -464,7 +466,7 @@ xgb.attributes <- function(object) {
#' @details
#' Note that the setter would usually work more efficiently for \code{xgb.Booster.handle}
#' than for \code{xgb.Booster}, since only a handle would need to be copied.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
@@ -473,7 +475,7 @@ xgb.attributes <- function(object) {
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#'
#' xgb.parameters(bst) <- list(eta = 0.1)
#'
#' @rdname xgb.parameters
#' @export
`xgb.parameters<-` <- function(object, value) {
@@ -503,28 +505,28 @@ xgb.ntree <- function(bst) {
#' Print xgb.Booster
#'
#' Print information about xgb.Booster.
#'
#' @param x an xgb.Booster object
#' @param verbose whether to print detailed data (e.g., attribute values)
#' @param ... not currently used
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' attr(bst, 'myattr') <- 'memo'
#'
#' print(bst)
#' print(bst, verbose=TRUE)
#'
#' @method print xgb.Booster
#' @export
print.xgb.Booster <- function(x, verbose = FALSE, ...) {
cat('##### xgb.Booster\n')
valid_handle <- is.null.handle(x$handle)
if (!valid_handle)
cat("Handle is invalid! Suggest using xgb.Booster.complete\n")
@@ -539,10 +541,10 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
cat('call:\n ')
print(x$call)
}
if (!is.null(x$params)) {
cat('params (as set within xgb.train):\n')
cat( ' ',
paste(names(x$params),
paste0('"', unlist(x$params), '"'),
sep = ' = ', collapse = ', '), '\n', sep = '')
@@ -562,7 +564,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
}
}
if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
cat('callbacks:\n')
lapply(callback.calls(x$callbacks), function(x) {
@@ -570,14 +572,14 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
print(x)
})
}
if (!is.null(x$feature_names))
cat('# of features:', length(x$feature_names), '\n')
cat('niter: ', x$niter, '\n', sep = '')
# TODO: uncomment when faster xgb.ntree is implemented
#cat('ntree: ', xgb.ntree(x), '\n', sep='')
for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
'evaluation_log','niter','feature_names'))) {
if (is.atomic(x[[n]])) {
@@ -587,11 +589,11 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) {
print(x[[n]])
}
}
if (!is.null(x$evaluation_log)) {
cat('evaluation_log:\n')
print(x$evaluation_log, row.names = FALSE, topn = 2)
}
invisible(x)
}


@@ -52,9 +52,9 @@ test_that("xgb.dump works", {
test_that("xgb.dump works for gblinear", {
expect_length(xgb.dump(bst.GLM), 14)
# also make sure that it works properly for a sparse model where some coefficients
# are 0 from setting large L1 regularization:
bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
alpha=2, objective = "binary:logistic", booster = "gblinear")
d.sp <- xgb.dump(bst.GLM.sp)
expect_length(d.sp, 14)
@@ -80,28 +80,35 @@ test_that("predict feature contributions works", {
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# gbtree binary classifier (approximate method)
expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE, approxcontrib = TRUE), regexp = NA)
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# gblinear binary classifier
expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5)
# manual calculation of linear terms
coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 1e-5)
# gbtree multiclass
pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
expect_is(pred_contr, "list")
expect_length(pred_contr, 3)
for (g in seq_along(pred_contr)) {
expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 1e-5)
}
# gblinear multiclass (set base_score = 0, which is base margin in multiclass)
@@ -192,10 +199,10 @@ test_that("xgb.model.dt.tree works with and without feature names", {
expect_equal(names.dt.trees, names(dt.tree))
expect_equal(dim(dt.tree), c(188, 10))
expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
expect_equal(dt.tree, dt.tree.0)
# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
@@ -219,20 +226,20 @@ test_that("xgb.importance works with and without feature names", {
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
importance.Tree.0 <- xgb.importance(model = bst.Tree)
expect_equal(importance.Tree, importance.Tree.0)
# when model contains no feature names:
bst.Tree.x <- bst.Tree
bst.Tree.x$feature_names <- NULL
importance.Tree.x <- xgb.importance(model = bst.Tree)
expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE])
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
xgb.ggplot.importance(importance_matrix = importance.Tree)
# for multiclass
imp.Tree <- xgb.importance(model = mbst.Tree)
expect_equal(dim(imp.Tree), c(4, 4))
@@ -247,7 +254,7 @@ test_that("xgb.importance works with GLM model", {
imp2plot <- xgb.plot.importance(importance.GLM)
expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
xgb.ggplot.importance(importance.GLM)
# for multiclass
imp.GLM <- xgb.importance(model = mbst.GLM)
expect_equal(dim(imp.GLM), c(12, 3))


@@ -115,10 +115,11 @@ class GradientBooster {
* \param out_contribs output vector to hold the contributions
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
* we do not limit number of trees
* \param approximate use a faster (inconsistent) approximation of SHAP values
*/
virtual void PredictContribution(DMatrix* dmat,
std::vector<bst_float>* out_contribs,
unsigned ntree_limit = 0, bool approximate = false) = 0;
/*!
* \brief dump the model in the requested format


@@ -104,13 +104,15 @@ class Learner : public rabit::Serializable {
* predictor, when it equals 0, this means we are using all the trees
* \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
* \param pred_contribs whether to only predict the feature contributions
* \param approx_contribs whether to approximate the feature contributions for speed
*/
virtual void Predict(DMatrix* data,
bool output_margin,
std::vector<bst_float> *out_preds,
unsigned ntree_limit = 0,
bool pred_leaf = false,
bool pred_contribs = false,
bool approx_contribs = false) const = 0;
/*!
* \brief Set additional attribute to the Booster.
* The property will be saved along the booster.


@@ -144,12 +144,14 @@ class Predictor {
* \param [in,out] out_contribs The output feature contribs.
* \param model Model to make predictions from.
* \param ntree_limit (Optional) The ntree limit.
* \param approximate Use fast approximate algorithm.
*/
virtual void PredictContribution(DMatrix* dmat,
std::vector<bst_float>* out_contribs,
const gbm::GBTreeModel& model,
unsigned ntree_limit = 0,
bool approximate = false) = 0;
/**
* \fn static Predictor* Predictor::Create(std::string name);


@@ -14,6 +14,7 @@
#include <string>
#include <cstring>
#include <algorithm>
#include <tuple>
#include "./base.h"
#include "./data.h"
#include "./logging.h"
@@ -411,6 +412,20 @@ struct RTreeNodeStat {
int leaf_child_cnt;
};
// Used by TreeShap
// data we keep about our decision path
// note that pweight is included for convenience and is not tied with the other attributes
// the pweight of the i'th path element is the permutation weight of paths with i-1 ones in them
struct PathElement {
int feature_index;
bst_float zero_fraction;
bst_float one_fraction;
bst_float pweight;
PathElement() {}
PathElement(int i, bst_float z, bst_float o, bst_float w) :
feature_index(i), zero_fraction(z), one_fraction(o), pweight(w) {}
};
/*!
* \brief define regression tree to be the most common tree model.
* This is the data structure used in xgboost's major tree models.
@@ -482,13 +497,26 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat> {
*/
inline bst_float Predict(const FVec& feat, unsigned root_id = 0) const;
/*!
* \brief calculate the feature contributions (https://arxiv.org/abs/1706.06060) for the tree
* \param feat dense feature vector, if the feature is missing the field is set to NaN
* \param root_id starting root index of the instance
* \param out_contribs output vector to hold the contributions
*/
inline void CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
bst_float *out_contribs) const;
inline void TreeShap(const RegTree::FVec& feat, bst_float *phi,
unsigned node_index, unsigned unique_depth,
PathElement *parent_unique_path, bst_float parent_zero_fraction,
bst_float parent_one_fraction, int parent_feature_index) const;
/*!
* \brief calculate the approximate feature contributions for the given root
* \param feat dense feature vector, if the feature is missing the field is set to NaN
* \param root_id starting root index of the instance
* \param out_contribs output vector to hold the contributions
*/
inline void CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
bst_float *out_contribs) const;
/*!
* \brief get next position of the tree given current pid
* \param pid Current node id.
@@ -590,7 +618,7 @@ inline bst_float RegTree::FillNodeMeanValue(int nid) {
return result;
}
inline void RegTree::CalculateContributionsApprox(const RegTree::FVec& feat, unsigned root_id,
bst_float *out_contribs) const {
CHECK_GT(this->node_mean_values.size(), 0U);
// this follows the idea of http://blog.datadive.net/interpreting-random-forests/
@@ -617,6 +645,154 @@ inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned
out_contribs[split_index] += leaf_value - node_value;
}
// extend our decision path with a fraction of one and zero extensions
inline void ExtendPath(PathElement *unique_path, unsigned unique_depth,
bst_float zero_fraction, bst_float one_fraction, int feature_index) {
unique_path[unique_depth].feature_index = feature_index;
unique_path[unique_depth].zero_fraction = zero_fraction;
unique_path[unique_depth].one_fraction = one_fraction;
unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
for (int i = unique_depth-1; i >= 0; i--) {
unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1)
/ static_cast<bst_float>(unique_depth+1);
unique_path[i].pweight = zero_fraction*unique_path[i].pweight*(unique_depth-i)
/ static_cast<bst_float>(unique_depth+1);
}
}
// undo a previous extension of the decision path
inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned path_index) {
const bst_float one_fraction = unique_path[path_index].one_fraction;
const bst_float zero_fraction = unique_path[path_index].zero_fraction;
bst_float next_one_portion = unique_path[unique_depth].pweight;
for (int i = unique_depth-1; i >= 0; --i) {
if (one_fraction != 0) {
const bst_float tmp = unique_path[i].pweight;
unique_path[i].pweight = next_one_portion*(unique_depth+1)
/ static_cast<bst_float>((i+1)*one_fraction);
next_one_portion = tmp - unique_path[i].pweight*zero_fraction*(unique_depth-i)
/ static_cast<bst_float>(unique_depth+1);
} else {
unique_path[i].pweight = (unique_path[i].pweight*(unique_depth+1))
/ static_cast<bst_float>(zero_fraction*(unique_depth-i));
}
}
for (int i = path_index; i < unique_depth; ++i) {
unique_path[i].feature_index = unique_path[i+1].feature_index;
unique_path[i].zero_fraction = unique_path[i+1].zero_fraction;
unique_path[i].one_fraction = unique_path[i+1].one_fraction;
}
}
// determine what the total permutation weight would be if
// we unwound a previous extension in the decision path
inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_depth,
unsigned path_index) {
const bst_float one_fraction = unique_path[path_index].one_fraction;
const bst_float zero_fraction = unique_path[path_index].zero_fraction;
bst_float next_one_portion = unique_path[unique_depth].pweight;
bst_float total = 0;
for (int i = unique_depth-1; i >= 0; --i) {
if (one_fraction != 0) {
const bst_float tmp = next_one_portion*(unique_depth+1)
/ static_cast<bst_float>((i+1)*one_fraction);
total += tmp;
next_one_portion = unique_path[i].pweight - tmp*zero_fraction*((unique_depth-i)
/ static_cast<bst_float>(unique_depth+1));
} else {
total += (unique_path[i].pweight/zero_fraction)/((unique_depth-i)
/ static_cast<bst_float>(unique_depth+1));
}
}
return total;
}
// recursive computation of SHAP values for a decision tree
inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
unsigned node_index, unsigned unique_depth,
PathElement *parent_unique_path, bst_float parent_zero_fraction,
bst_float parent_one_fraction, int parent_feature_index) const {
const auto node = (*this)[node_index];
// extend the unique path
PathElement *unique_path = parent_unique_path + unique_depth;
if (unique_depth > 0) std::copy(parent_unique_path, parent_unique_path+unique_depth, unique_path);
ExtendPath(unique_path, unique_depth, parent_zero_fraction,
parent_one_fraction, parent_feature_index);
const unsigned split_index = node.split_index();
// leaf node
if (node.is_leaf()) {
for (int i = 1; i <= unique_depth; ++i) {
const bst_float w = UnwoundPathSum(unique_path, unique_depth, i);
const PathElement &el = unique_path[i];
phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value();
}
// internal node
} else {
// find which branch is "hot" (meaning x would follow it)
unsigned hot_index = 0;
if (feat.is_missing(split_index)) {
hot_index = node.cdefault();
} else if (feat.fvalue(split_index) < node.split_cond()) {
hot_index = node.cleft();
} else {
hot_index = node.cright();
}
const unsigned cold_index = (hot_index == node.cleft() ? node.cright() : node.cleft());
const bst_float w = this->stat(node_index).sum_hess;
const bst_float hot_zero_fraction = this->stat(hot_index).sum_hess/w;
const bst_float cold_zero_fraction = this->stat(cold_index).sum_hess/w;
bst_float incoming_zero_fraction = 1;
bst_float incoming_one_fraction = 1;
// see if we have already split on this feature,
// if so we undo that split so we can redo it for this node
unsigned path_index = 0;
for (; path_index <= unique_depth; ++path_index) {
if (unique_path[path_index].feature_index == split_index) break;
}
if (path_index != unique_depth+1) {
incoming_zero_fraction = unique_path[path_index].zero_fraction;
incoming_one_fraction = unique_path[path_index].one_fraction;
UnwindPath(unique_path, unique_depth, path_index);
unique_depth -= 1;
}
TreeShap(feat, phi, hot_index, unique_depth+1, unique_path,
hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index);
TreeShap(feat, phi, cold_index, unique_depth+1, unique_path,
cold_zero_fraction*incoming_zero_fraction, 0, split_index);
}
}
inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id,
bst_float *out_contribs) const {
// find the expected value of the tree's predictions
bst_float base_value = 0.0;
bst_float total_cover = 0;
for (unsigned i = 0; i < (*this).param.num_nodes; ++i) {
const auto node = (*this)[i];
if (node.is_leaf()) {
const auto cover = this->stat(i).sum_hess;
base_value += cover*node.leaf_value();
total_cover += cover;
}
}
out_contribs[feat.size()] += base_value / total_cover;
// Preallocate space for the unique path data
const int maxd = this->MaxDepth(root_id)+1;
PathElement *unique_path_data = new PathElement[(maxd*(maxd+1))/2];
TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1);
delete[] unique_path_data;
}
/*! \brief get next position of the tree given current pid */
inline int RegTree::GetNext(int pid, bst_float fvalue, bool is_unknown) const {
bst_float split_value = (*this)[pid].split_cond();

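The ExtendPath/UnwindPath pair above maintains the Shapley permutation weights (pweight) incrementally as the recursion descends and backtracks. A minimal Python sketch of just that weight arithmetic, not part of the patch and with illustrative names and fractions, checking that an unwind exactly reverses an extension:

```python
# Sketch of the pweight recurrences from ExtendPath/UnwindPath (illustrative only).
def extend(pweights, depth, zero_fraction, one_fraction):
    # append a slot for the new path element, then redistribute the weights
    pweights.append(1.0 if depth == 0 else 0.0)
    for i in range(depth - 1, -1, -1):
        pweights[i + 1] += one_fraction * pweights[i] * (i + 1) / (depth + 1)
        pweights[i] = zero_fraction * pweights[i] * (depth - i) / (depth + 1)

def unwind(pweights, depth, zero_fraction, one_fraction):
    # undo the most recent extension that used (zero_fraction, one_fraction)
    next_one_portion = pweights[depth]
    for i in range(depth - 1, -1, -1):
        if one_fraction != 0:
            tmp = pweights[i]
            pweights[i] = next_one_portion * (depth + 1) / ((i + 1) * one_fraction)
            next_one_portion = tmp - pweights[i] * zero_fraction * (depth - i) / (depth + 1)
        else:
            pweights[i] = pweights[i] * (depth + 1) / (zero_fraction * (depth - i))
    pweights.pop()

pw = []
extend(pw, 0, 1.0, 1.0)   # root call: both fractions are 1
extend(pw, 1, 0.4, 1.0)   # hot branch of a split carrying 40% of the parent's hessian sum
snapshot = list(pw)
extend(pw, 2, 0.7, 0.0)   # cold branch of a later split: one_fraction is 0
unwind(pw, 2, 0.7, 0.0)   # unwinding restores the previous weights
assert all(abs(a - b) < 1e-12 for a, b in zip(pw, snapshot))
```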

@@ -990,7 +990,7 @@ class Booster(object):
return self.eval_set([(data, name)], iteration)
def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False,
pred_contribs=False, approx_contribs=False):
"""
Predict with data.
@@ -1018,9 +1018,12 @@
pred_contribs : bool
When this option is on, the output will be a matrix of (nsample, nfeats+1)
with each record indicating the feature contributions (SHAP values) for that
prediction. The sum of all feature contributions is equal to the prediction.
Note that the bias is added as the final column, on top of the regular features.
approx_contribs : bool
Approximate the contributions of each feature
Returns
-------
@@ -1033,6 +1036,8 @@
option_mask |= 0x02
if pred_contribs:
option_mask |= 0x04
if approx_contribs:
option_mask |= 0x08
self._validate_features(data)

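For context, a short usage sketch of the new flags from Python; this is not part of the diff, and the synthetic data and parameter values are just illustrative:

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] + X[:, 1] > 0).astype(int)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({"max_depth": 3, "eta": 0.3, "objective": "binary:logistic"}, dtrain, 10)

margin = bst.predict(dtrain, output_margin=True)
contribs = bst.predict(dtrain, pred_contribs=True)                             # exact SHAP values
contribs_fast = bst.predict(dtrain, pred_contribs=True, approx_contribs=True)  # fast approximation

# (nsample, nfeats + 1): per-feature contributions plus the bias column,
# and each row sums to the untransformed margin prediction
assert contribs.shape == (200, 6)
assert np.allclose(contribs.sum(axis=1), margin, atol=1e-5)
```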

@@ -758,7 +758,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
(option_mask & 1) != 0,
&preds, ntree_limit,
(option_mask & 2) != 0,
(option_mask & 4) != 0,
(option_mask & 8) != 0);
*out_result = dmlc::BeginPtr(preds);
*len = static_cast<xgboost::bst_ulong>(preds.size());
API_END();


@@ -224,7 +224,7 @@ class GBLinear : public GradientBooster {
void PredictContribution(DMatrix* p_fmat,
std::vector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate) override {
if (model.weight.size() == 0) {
model.InitModel();
}


@@ -233,8 +233,8 @@ class GBTree : public GradientBooster {
void PredictContribution(DMatrix* p_fmat,
std::vector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate) override {
predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit, approximate);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,


@@ -433,9 +433,9 @@ class LearnerImpl : public Learner {
void Predict(DMatrix* data, bool output_margin,
std::vector<bst_float>* out_preds, unsigned ntree_limit,
bool pred_leaf, bool pred_contribs, bool approx_contribs) const override {
if (pred_contribs) {
gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
} else if (pred_leaf) {
gbm_->PredictLeaf(data, out_preds, ntree_limit);
} else {


@@ -206,9 +206,9 @@ class CPUPredictor : public Predictor {
}
}
void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
const gbm::GBTreeModel& model, unsigned ntree_limit,
bool approximate) override {
const int nthread = omp_get_max_threads();
InitThreadTemp(nthread, model.param.num_feature);
const MetaInfo& info = p_fmat->info();
@@ -225,10 +225,12 @@ class CPUPredictor : public Predictor {
// make sure contributions is zeroed, we could be reusing a previously
// allocated one
std::fill(contribs.begin(), contribs.end(), 0);
if (approximate) {
// initialize tree node mean values
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
model.trees[i]->FillNodeMeanValues();
}
}
// start collecting the contributions
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
@@ -253,7 +255,7 @@ class CPUPredictor : public Predictor {
if (model.tree_info[j] != gid) {
continue;
}
if (!approximate) {
model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
} else {
model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs);
}
}
feats.Drop(batch[i]);
// add base margin to BIAS


@@ -384,9 +384,10 @@ class GPUPredictor : public xgboost::Predictor {
void PredictContribution(DMatrix* p_fmat,
std::vector<bst_float>* out_contribs,
const gbm::GBTreeModel& model,
unsigned ntree_limit,
bool approximate) override {
cpu_predictor->PredictContribution(p_fmat, out_contribs, model,
ntree_limit, approximate);
}
void Init(const std::vector<std::pair<std::string, std::string>>& cfg,


@@ -12,6 +12,7 @@ TEST(cpu_predictor, Test) {
trees.push_back(std::unique_ptr<RegTree>(new RegTree));
trees.back()->InitModel();
(*trees.back())[0].set_leaf(1.5f);
(*trees.back()).stat(0).sum_hess = 1.0f;
gbm::GBTreeModel model(0.5);
model.CommitModel(std::move(trees), 0);
model.param.num_output_group = 1;
@@ -50,5 +51,11 @@ TEST(cpu_predictor, Test) {
for (int i = 0; i < out_contribution.size(); i++) {
ASSERT_EQ(out_contribution[i], 1.5);
}
// Test predict contribution (approximate method)
cpu_predictor->PredictContribution(dmat.get(), &out_contribution, model, 0, true);
for (int i = 0; i < out_contribution.size(); i++) {
ASSERT_EQ(out_contribution[i], 1.5);
}
}
} // namespace xgboost


@@ -19,6 +19,7 @@ TEST(gpu_predictor, Test) {
trees.push_back(std::unique_ptr<RegTree>());
trees.back()->InitModel();
(*trees.back())[0].set_leaf(1.5f);
(*trees.back()).stat(0).sum_hess = 1.0f;
gbm::GBTreeModel model(0.5);
model.CommitModel(std::move(trees), 0);
model.param.num_output_group = 1;


@@ -291,3 +291,18 @@ def test_contributions():
for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)):
yield test_fn, max_depth, num_rounds
# check that we get the right SHAP values for a basic AND example
# (https://arxiv.org/abs/1706.06060)
X = np.zeros((4, 2))
X[0, :] = 1
X[1, 0] = 1
X[2, 1] = 1
y = np.zeros(4)
y[0] = 1
param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0}
bst = xgb.train(param, xgb.DMatrix(X, label=y), 1)
out = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True)
assert out[0, 0] == 0.375
assert out[0, 1] == 0.375
assert out[0, 2] == 0.25
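
As a sanity check on the expected numbers, the Shapley values for this AND example can be reproduced by brute force. Assuming the single tree fits the labels exactly, with the four training rows as the background distribution: E[f] = 0.25, E[f | x0 = 1] = E[f | x1 = 1] = 0.5, and f(1, 1) = 1, so each feature gets 1/2 * [(1 - 0.5) + (0.5 - 0.25)] = 0.375 and the bias column is 0.25. A small illustrative script (not part of the test suite):

```python
# Brute-force Shapley values for f(x) = AND(x0, x1) over the four training rows,
# reproducing the 0.375 / 0.375 / 0.25 asserted above.
import itertools
from math import factorial
import numpy as np

background = np.array([[1, 1], [1, 0], [0, 1], [0, 0]], dtype=float)
f = lambda X: X[:, 0] * X[:, 1]   # the fitted tree behaves like an AND of the two features

def value(x, subset):
    # expectation of f with the features in `subset` fixed to x and the rest from the background
    Z = background.copy()
    Z[:, list(subset)] = x[list(subset)]
    return f(Z).mean()

x = np.array([1.0, 1.0])
phi = np.zeros(2)
for i in range(2):
    others = [j for j in range(2) if j != i]
    for r in range(len(others) + 1):
        for S in itertools.combinations(others, r):
            w = factorial(len(S)) * factorial(2 - len(S) - 1) / factorial(2)
            phi[i] += w * (value(x, S + (i,)) - value(x, S))

print(phi, value(x, ()))   # -> [0.375 0.375] 0.25
```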