From b52db87d5c3e81440b6b0f0a33f33ad080eaeb68 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 21 May 2017 06:41:51 -0500 Subject: [PATCH] adding feature contributions to R and gblinear (#2295) * [gblinear] add features contribution prediction; fix DumpModel bug * [gbtree] minor changes to PredContrib * [R] add feature contribution prediction to R * [R] bump up version; update NEWS * [gblinear] fix the base_margin issue; fixes #1969 * [R] list of matrices as output of multiclass feature contributions * [gblinear] make order of DumpModel coefficients consistent: group index changes the fastest --- NEWS.md | 4 ++ R-package/DESCRIPTION | 2 +- R-package/R/xgb.Booster.R | 90 +++++++++++++++++++------ R-package/man/predict.xgb.Booster.Rd | 48 +++++++++++-- R-package/tests/testthat/test_helpers.R | 78 +++++++++++++++++++-- include/xgboost/c_api.h | 2 +- include/xgboost/gbm.h | 4 +- include/xgboost/learner.h | 2 +- src/gbm/gblinear.cc | 74 +++++++++++++++----- src/gbm/gbtree.cc | 11 +-- 10 files changed, 255 insertions(+), 60 deletions(-) diff --git a/NEWS.md b/NEWS.md index 61da8a36b..064041988 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,10 +12,14 @@ This file records the changes in xgboost library in reverse chronological order. - Thread local variable is upgraded so it is automatically freed at thread exit. * Migrate to C++11 - The current master version now requires C++11 enabled compiled(g++4.8 or higher) +* New functionality + - Ability to adjust tree model's statistics to a new dataset without changing tree structures. + - Extracting feature contributions to individual predictions. * R package: - New parameters: - `silent` in `xgb.DMatrix()` - `use_int_id` in `xgb.model.dt.tree()` + - `predcontrib` in `predict()` - Default value of the `save_period` parameter in `xgboost()` changed to NULL (consistent with `xgb.train()`). ## v0.6 (2016.07.29) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index b1a91be17..758c454af 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 0.6.4.4 +Version: 0.6.4.5 Date: 2017-01-04 Author: Tianqi Chen , Tong He , Michael Benesty , Vadim Khotilovich , diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 94bd27028..8e72486bd 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -126,7 +126,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' logistic regression would result in predictions for log-odds instead of probabilities. #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details). #' It will use all the trees by default (\code{NULL} value). -#' @param predleaf whether predict leaf index instead. +#' @param predleaf whether predict leaf index instead. +#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details). #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several #' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}. #' @param ... Parameters passed to \code{predict.xgb.Booster} @@ -135,15 +136,22 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations #' and it is not necessarily equal to the number of trees in a model. #' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees. 
-#' But for multiclass classification, there are multiple trees per iteration,
-#' but \code{ntreelimit} limits the number of boosting iterations.
+#' But for multiclass classification, while there are multiple trees per iteration,
+#' \code{ntreelimit} limits the number of boosting iterations.
 #'
 #' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
-#' since gblinear doesn't keep its boosting history.
+#' since gblinear doesn't keep its boosting history. 
 #'
 #' One possible practical applications of the \code{predleaf} option is to use the model
 #' as a generator of new features which capture non-linearity and interactions,
-#' e.g., as implemented in \code{\link{xgb.create.features}}.
+#' e.g., as implemented in \code{\link{xgb.create.features}}. 
+#'
+#' Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+#' (feature_beta * feature_value). For "gbtree" booster, a feature's contribution is calculated
+#' as the sum of the average contributions of that feature's split nodes across all trees to an
+#' individual prediction, following the idea explained in
+#' \url{http://blog.datadive.net/interpreting-random-forests/}.
 #'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
@@ -154,6 +162,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' When \code{predleaf = TRUE}, the output is a matrix object with the
 #' number of columns corresponding to the number of trees.
 #'
+#' When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+#' \code{num_features + 1} columns. The last "+ 1" column corresponds to the bias.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such a matrix. The contribution values are on the scale of the untransformed margin
+#' (e.g., for binary classification, the contributions are log-odds deviations from the bias).
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
@@ -166,11 +180,32 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' ## binary classification:
 #'
 #' library(xgboost)
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#'                eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred <- predict(bst, test$data, ntreelimit = 1)
+#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#'
+#' # Predicting tree leaves:
+#' # the result is an nsamples X ntrees matrix
+#' pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+#' str(pred_leaf)
+#'
+#' # Predicting feature contributions to predictions:
+#' # the result is an nsamples X (nfeatures + 1) matrix
+#' pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+#' str(pred_contr)
+#' # verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+#' summary(rowSums(pred_contr) - qlogis(pred))
+#' # for the 1st record, let's inspect the features that had a non-zero contribution to the prediction:
+#' contr1 <- pred_contr[1,]
+#' contr1 <- contr1[-length(contr1)] # drop BIAS
+#' contr1 <- contr1[contr1 != 0] # drop non-contributing features
+#' contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+#' old_mar <- par("mar")
+#' par(mar = old_mar + c(0,7,0,0))
+#' barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+#' par(mar = old_mar)
 #'
 #'
 #' ## multiclass classification in iris dataset:
@@ -222,8 +257,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #'
 #' @rdname predict.xgb.Booster
 #' @export
-predict.xgb.Booster <- function(object, newdata, missing = NA,
-                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {
+predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
+                                predleaf = FALSE, predcontrib = FALSE, reshape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -235,23 +270,40 @@ predict.xgb.Booster <- function(object, newdata, missing = NA,
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib)
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
-  if (length(ret) %% nrow(newdata) != 0)
-    stop("prediction length ", length(ret)," is not multiple of nrows(newdata) ", nrow(newdata))
-  npred_per_case <- length(ret) / nrow(newdata)
-
-  if (predleaf){
-    len <- nrow(newdata)
-    ret <- if (length(ret) == len) {
+  n_ret <- length(ret)
+  n_row <- nrow(newdata)
+  npred_per_case <- n_ret / n_row
+
+  if (n_ret %% n_row != 0)
+    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
+
+  if (predleaf) {
+    ret <- if (n_ret == n_row) {
       matrix(ret, ncol = 1)
     } else {
-      t(matrix(ret, ncol = len))
+      matrix(ret, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    n_col1 <- ncol(newdata) + 1
+    n_group <- npred_per_case / n_col1
+    dnames <- list(NULL, c(colnames(newdata), "BIAS"))
+    ret <- if (n_ret == n_row) {
+      matrix(ret, ncol = 1, dimnames = dnames)
+    } else if (n_group == 1) {
+      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+    } else {
+      grp_mask <- rep(1:n_col1, n_row) +
+                  rep((0:(n_row - 1)) * n_col1 * n_group, each = n_col1)
+      lapply(1:n_group, function(g) {
+        matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
+      })
     }
   } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, ncol = length(ret) / nrow(newdata), byrow = TRUE)
+    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
   }
   return(ret)
 }
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 67a06fe2a..ef93c12a7 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -7,7 +7,7 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  reshape = FALSE, ...)
+  predcontrib = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -28,6 +28,8 @@ It will use all the trees by default (\code{NULL} value).}
 
 \item{predleaf}{whether predict leaf index instead.}
 
+\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
 prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
 
@@ -41,6 +43,12 @@ the \code{reshape} value.
 
 When \code{predleaf = TRUE}, the output is a matrix object with the
 number of columns corresponding to the number of trees.
+
+When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+\code{num_features + 1} columns. The last "+ 1" column corresponds to the bias.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such a matrix. The contribution values are on the scale of the untransformed margin
+(e.g., for binary classification, the contributions are log-odds deviations from the bias).
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
@@ -49,15 +57,22 @@ Predicted values based on either xgboost model or model handle object.
 Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
 and it is not necessarily equal to the number of trees in a model.
 E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, there are multiple trees per iteration,
-but \code{ntreelimit} limits the number of boosting iterations.
+But for multiclass classification, while there are multiple trees per iteration,
+\code{ntreelimit} limits the number of boosting iterations.
 
 Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
-since gblinear doesn't keep its boosting history.
+since gblinear doesn't keep its boosting history. 
 
 One possible practical applications of the \code{predleaf} option is to use the model
 as a generator of new features which capture non-linearity and interactions,
 e.g., as implemented in \code{\link{xgb.create.features}}.
+
+Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+(feature_beta * feature_value). For "gbtree" booster, a feature's contribution is calculated
+as the sum of the average contributions of that feature's split nodes across all trees to an
+individual prediction, following the idea explained in
+\url{http://blog.datadive.net/interpreting-random-forests/}.
 }
 \examples{
 ## binary classification:
@@ -68,11 +83,32 @@ train <- agaricus.train
 test <- agaricus.test
 
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, ntreelimit = 1)
+
+# Predicting tree leaves:
+# the result is an nsamples X ntrees matrix
+pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+str(pred_leaf)
+
+# Predicting feature contributions to predictions:
+# the result is an nsamples X (nfeatures + 1) matrix
+pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+str(pred_contr)
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+summary(rowSums(pred_contr) - qlogis(pred))
+# for the 1st record, let's inspect the features that had a non-zero contribution to the prediction:
+contr1 <- pred_contr[1,]
+contr1 <- contr1[-length(contr1)] # drop BIAS
+contr1 <- contr1[contr1 != 0] # drop non-contributing features
+contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+old_mar <- par("mar")
+par(mar = old_mar + c(0,7,0,0))
+barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+par(mar = old_mar)
 
 ## multiclass classification in iris dataset:
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 84df814c8..64814c403 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -14,18 +14,19 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 label <- df[, ifelse(Improved == "Marked", 1, 0)]
+nrounds <- 12
 bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
-                  eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+                  eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
                   objective = "binary:logistic", booster = "gbtree")
 bst.GLM <- xgboost(data = sparse_matrix, label = label,
-                  eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+                  eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
                   objective = "binary:logistic", booster = "gblinear")
 feature.names <- colnames(sparse_matrix)
 test_that("xgb.dump works", {
-  expect_length(xgb.dump(bst.Tree), 172)
+  expect_length(xgb.dump(bst.Tree), 200)
   expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
   expect_true(file.exists('xgb.model.dump'))
   expect_gt(file.size('xgb.model.dump'), 8000)
@@ -33,7 +34,7 @@ test_that("xgb.dump works", {
   # JSON format
   dmp <- xgb.dump(bst.Tree, dump_format = "json")
   expect_length(dmp, 1)
-  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
+  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
 })
 test_that("xgb.dump works for gblinear", {
@@ -52,13 +53,74 @@ test_that("xgb.dump works for gblinear", {
   expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
 })
+test_that("predict leaves works", {
+  # no error for gbtree
+  expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
+  expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
+  # error for gblinear
+  expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
+})
+
+test_that("predict feature contributions works", {
+  # gbtree binary classifier
+  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-6)
+
+  # gblinear binary classifier
+  expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 2e-6)
+  # manual calculation of linear terms
+  coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
+  coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
+  pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
+  expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6)
+
+  # gbtree multiclass
+  lb <- as.numeric(iris$Species) - 1
+  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
+                 max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
+                 objective = "multi:softprob", num_class = 3)
+  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+  expect_is(pred_contr, "list")
+  expect_length(pred_contr, 3)
+  for (g in seq_along(pred_contr)) {
+    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
+  }
+
+  # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
+  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
+                 booster = "gblinear", eta = 0.1, nthread = 1, nrounds = 10,
+                 objective = "multi:softprob", num_class = 3, base_score = 0)
+  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+ 
expect_length(pred_contr, 3) + coefs_all <- xgb.dump(bst)[-c(1,2,6)] %>% as.numeric + for (g in seq_along(pred_contr)) { + expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS")) + expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6) + # manual calculation of linear terms + coefs <- coefs_all[seq(g, length(coefs_all), by = 3)] + coefs <- c(coefs[-1], coefs[1]) # intercept needs to be the last + pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*") + expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), 2e-6) + } +}) + test_that("xgb-attribute functionality", { val <- "my attribute value" list.val <- list(my_attr=val, a=123, b='ok') list.ch <- list.val[order(names(list.val))] list.ch <- lapply(list.ch, as.character) # note: iter is 0-index in xgb attributes - list.default <- list(niter = "9") + list.default <- list(niter = as.character(nrounds - 1)) list.ch <- c(list.ch, list.default) # proper input: expect_error(xgb.attr(bst.Tree, NULL)) @@ -85,7 +147,9 @@ test_that("xgb-attribute functionality", { expect_null(xgb.attributes(bst)) }) -if (grepl('Windows', Sys.info()[['sysname']]) || grepl('Linux', Sys.info()[['sysname']]) || grepl('Darwin', Sys.info()[['sysname']])) { +if (grepl('Windows', Sys.info()[['sysname']]) || + grepl('Linux', Sys.info()[['sysname']]) || + grepl('Darwin', Sys.info()[['sysname']])) { test_that("xgb-attribute numeric precision", { # check that lossless conversion works with 17 digits # numeric -> character -> numeric @@ -121,7 +185,7 @@ test_that("xgb.model.dt.tree works with and without feature names", { names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree) expect_equal(names.dt.trees, names(dt.tree)) - expect_equal(dim(dt.tree), c(162, 10)) + expect_equal(dim(dt.tree), c(188, 10)) expect_output(str(dt.tree), 'Feature.*\\"Age\\"') dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index decdcd0bf..8b37d2dc1 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -384,7 +384,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, * 0:normal prediction * 1:output margin instead of transformed value * 2:output leaf index of trees instead of leaf value, note leaf index is unique per tree - * 4:output feature contributions of all trees instead of predictions + * 4:output feature contributions to individual predictions * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees * when the parameter is set to 0, we will use all the trees * \param out_len used to store length of returning result diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index aadd074f8..22f99a7a5 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -109,8 +109,8 @@ class GradientBooster { unsigned ntree_limit = 0) = 0; /*! 
-   * \brief predict the feature contributions of each tree, the output will be nsample * (nfeats + 1) vector
-   *  this is only valid in gbtree predictor
+   * \brief feature contributions to individual predictions; the output will be a vector
+   *        of length (nfeats + 1) * num_output_group * nsample, arranged in that order
    * \param dmat feature matrix
    * \param out_contribs output vector to hold the contributions
    * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 4733f5522..ab41d337b 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -103,7 +103,7 @@ class Learner : public rabit::Serializable {
    * \param ntree_limit limit number of trees used for boosted tree
    *   predictor, when it equals 0, this means we are using all the trees
    * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
-   * \param pred_contribs whether to only predict the feature contributions of all trees
+   * \param pred_contribs whether to only predict the feature contributions
    */
   virtual void Predict(DMatrix* data,
                        bool output_margin,
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 168fe4b57..9aa661f6b 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -180,10 +180,6 @@ class GBLinear : public GradientBooster {
         << "GBLinear::Predict ntrees is only valid for gbtree predictor";
     std::vector<bst_float> &preds = *out_preds;
     const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
-    if (base_margin.size() != 0) {
-      CHECK_EQ(preds.size(), base_margin.size())
-          << "base_margin.size does not match with prediction size";
-    }
     preds.resize(0);
     // start collecting the prediction
     dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
@@ -218,45 +214,87 @@
       this->Pred(inst, dmlc::BeginPtr(*out_preds), gid, base_margin_);
     }
   }
+
   void PredictLeaf(DMatrix *p_fmat, std::vector<bst_float> *out_preds, unsigned ntree_limit) override {
-    LOG(FATAL) << "gblinear does not support predict leaf index";
+    LOG(FATAL) << "gblinear does not support prediction of leaf index";
   }
+
   void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs, unsigned ntree_limit) override {
-    LOG(FATAL) << "gblinear does not support predict contributions";
+    if (model.weight.size() == 0) {
+      model.InitModel();
+    }
+    CHECK_EQ(ntree_limit, 0U)
+        << "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
+    const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
+    const int ngroup = model.param.num_output_group;
+    const size_t ncolumns = model.param.num_feature + 1;
+    // allocate space for (#features + bias) times #groups times #rows
+    std::vector<bst_float>& contribs = *out_contribs;
+    contribs.resize(p_fmat->info().num_row * ncolumns * ngroup);
+    // make sure contributions is zeroed, we could be reusing a previously allocated one
+    std::fill(contribs.begin(), contribs.end(), 0);
+    // start collecting the contributions
+    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch& batch = iter->Value();
+      // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const RowBatch::Inst &inst = batch[i];
+        size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
+        // loop over output groups
+        for (int gid = 0; gid < ngroup; ++gid) {
+          bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
+          // calculate linear terms' contributions
+          for (bst_uint c = 0; c < inst.length; ++c) {
+            if (inst[c].index >= model.param.num_feature) continue;
+            p_contribs[inst[c].index] = inst[c].fvalue * model[inst[c].index][gid];
+          }
+          // add base margin to BIAS
+          p_contribs[ncolumns - 1] = model.bias()[gid] +
+              ((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] : base_margin_);
+        }
+      }
+    }
   }
   std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const override {
+    const int ngroup = model.param.num_output_group;
+    const unsigned nfeature = model.param.num_feature;
+
     std::stringstream fo("");
     if (format == "json") {
       fo << " { \"bias\": [" << std::endl;
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        if (i != 0) fo << "," << std::endl;
-        fo << " " << model.bias()[i];
+      for (int gid = 0; gid < ngroup; ++gid) {
+        if (gid != 0) fo << "," << std::endl;
+        fo << " " << model.bias()[gid];
       }
       fo << std::endl << " ]," << std::endl << " \"weight\": [" << std::endl;
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        for (unsigned j = 0; j < model.param.num_feature; ++j) {
-          if (i != 0 || j != 0) fo << "," << std::endl;
-          fo << " " << model[i][j];
+      for (unsigned i = 0; i < nfeature; ++i) {
+        for (int gid = 0; gid < ngroup; ++gid) {
+          if (i != 0 || gid != 0) fo << "," << std::endl;
+          fo << " " << model[i][gid];
         }
       }
       fo << std::endl << " ]" << std::endl << " }";
     } else {
       fo << "bias:\n";
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        fo << model.bias()[i] << std::endl;
+      for (int gid = 0; gid < ngroup; ++gid) {
+        fo << model.bias()[gid] << std::endl;
      }
      fo << "weight:\n";
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        for (unsigned j = 0; j < model.param.num_feature; ++j) {
-          fo << model[i][j] << std::endl;
+      for (unsigned i = 0; i < nfeature; ++i) {
+        for (int gid = 0; gid < ngroup; ++gid) {
+          fo << model[i][gid] << std::endl;
        }
      }
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
     if (ntree_limit == 0 || ntree_limit > trees.size()) {
       ntree_limit = static_cast<unsigned>(trees.size());
     }
+    const int ngroup = mparam.num_output_group;
     size_t ncolumns = mparam.num_feature + 1;
     // allocate space for (number of features + bias) times the number of rows
     std::vector<bst_float>& contribs = *out_contribs;
@@ -584,7 +585,7 @@
     }
     // start collecting the contributions
     dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
-    const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
+    const std::vector<bst_float>& base_margin = info.base_margin;
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch& batch = iter->Value();
@@ -596,8 +597,8 @@
         unsigned root_id = info.GetRoot(row_idx);
         RegTree::FVec &feats = thread_temp[omp_get_thread_num()];
         // loop over all classes
-        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-          bst_float *p_contribs = &contribs[(row_idx * mparam.num_output_group + gid) * ncolumns];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
           feats.Fill(batch[i]);
           // calculate contributions
           for (unsigned j = 0; j < ntree_limit; ++j) {
@@ -607,9 +608,9 @@
             trees[j]->CalculateContributions(feats, root_id, p_contribs);
           }
           feats.Drop(batch[i]);
-          // add base margin to BIAS feature
+          // add base margin to BIAS
           if (base_margin.size() != 0) {
-            p_contribs[ncolumns - 1] += base_margin[row_idx * mparam.num_output_group + gid];
+            p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
           } else {
             p_contribs[ncolumns - 1] += base_margin_;
           }
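A brief usage sketch follows; it is an editorial addition rather than part of the patch. It shows how the new multiclass `predcontrib` output (a list with one matrix per class, each with a trailing "BIAS" column) could be consumed, mirroring the checks added to test_helpers.R above, and it assumes an xgboost build that already includes these changes.

library(xgboost)

# multiclass model on iris, as in the new unit tests
lb <- as.numeric(iris$Species) - 1
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
               max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
               objective = "multi:softprob", num_class = 3)

# predcontrib = TRUE returns a list with one matrix per class;
# each matrix has one column per feature plus a final "BIAS" column
pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
margins <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)

# per class, the row sums of the contributions should reproduce the
# untransformed margin (up to float precision)
for (g in seq_along(pred_contr)) {
  cat("class", g, "max abs diff:", max(abs(rowSums(pred_contr[[g]]) - margins[, g])), "\n")
}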