From b52db87d5c3e81440b6b0f0a33f33ad080eaeb68 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 21 May 2017 06:41:51 -0500 Subject: [PATCH] adding feature contributions to R and gblinear (#2295) * [gblinear] add features contribution prediction; fix DumpModel bug * [gbtree] minor changes to PredContrib * [R] add feature contribution prediction to R * [R] bump up version; update NEWS * [gblinear] fix the base_margin issue; fixes #1969 * [R] list of matrices as output of multiclass feature contributions * [gblinear] make order of DumpModel coefficients consistent: group index changes the fastest --- NEWS.md | 4 ++ R-package/DESCRIPTION | 2 +- R-package/R/xgb.Booster.R | 90 +++++++++++++++++++------ R-package/man/predict.xgb.Booster.Rd | 48 +++++++++++-- R-package/tests/testthat/test_helpers.R | 78 +++++++++++++++++++-- include/xgboost/c_api.h | 2 +- include/xgboost/gbm.h | 4 +- include/xgboost/learner.h | 2 +- src/gbm/gblinear.cc | 74 +++++++++++++++----- src/gbm/gbtree.cc | 11 +-- 10 files changed, 255 insertions(+), 60 deletions(-) diff --git a/NEWS.md b/NEWS.md index 61da8a36b..064041988 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,10 +12,14 @@ This file records the changes in xgboost library in reverse chronological order. - Thread local variable is upgraded so it is automatically freed at thread exit. * Migrate to C++11 - The current master version now requires C++11 enabled compiled(g++4.8 or higher) +* New functionality + - Ability to adjust tree model's statistics to a new dataset without changing tree structures. + - Extracting feature contributions to individual predictions. * R package: - New parameters: - `silent` in `xgb.DMatrix()` - `use_int_id` in `xgb.model.dt.tree()` + - `predcontrib` in `predict()` - Default value of the `save_period` parameter in `xgboost()` changed to NULL (consistent with `xgb.train()`). ## v0.6 (2016.07.29) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index b1a91be17..758c454af 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 0.6.4.4 +Version: 0.6.4.5 Date: 2017-01-04 Author: Tianqi Chen , Tong He , Michael Benesty , Vadim Khotilovich , diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 94bd27028..8e72486bd 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -126,7 +126,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' logistic regression would result in predictions for log-odds instead of probabilities. #' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details). #' It will use all the trees by default (\code{NULL} value). -#' @param predleaf whether predict leaf index instead. +#' @param predleaf whether predict leaf index instead. +#' @param predcontrib whether to return feature contributions to individual predictions instead (see Details). #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several #' prediction outputs per case. This option has no effect when \code{predleaf = TRUE}. #' @param ... Parameters passed to \code{predict.xgb.Booster} @@ -135,15 +136,22 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations #' and it is not necessarily equal to the number of trees in a model. #' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees. 
-#' But for multiclass classification, there are multiple trees per iteration,
-#' but \code{ntreelimit} limits the number of boosting iterations.
+#' But for multiclass classification, while there are multiple trees per iteration,
+#' \code{ntreelimit} limits the number of boosting iterations.
 #'
 #' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
-#' since gblinear doesn't keep its boosting history.
+#' since gblinear doesn't keep its boosting history. 
 #'
 #' One possible practical applications of the \code{predleaf} option is to use the model
 #' as a generator of new features which capture non-linearity and interactions,
-#' e.g., as implemented in \code{\link{xgb.create.features}}.
+#' e.g., as implemented in \code{\link{xgb.create.features}}. 
+#'
+#' Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+#' individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+#' (feature_beta * feature_value). For "gbtree" booster, a feature's contribution is calculated
+#' as the sum of the average contributions of that feature's split nodes across all trees to an
+#' individual prediction, following the idea explained in
+#' \url{http://blog.datadive.net/interpreting-random-forests/}.
 #'
 #' @return
 #' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
@@ -154,6 +162,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' When \code{predleaf = TRUE}, the output is a matrix object with the
 #' number of columns corresponding to the number of trees.
 #'
+#' When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+#' \code{num_features + 1} columns. The last "+ 1" column corresponds to the bias.
+#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+#' such a matrix. The contribution values are on the scale of the untransformed margin
+#' (e.g., for binary classification, the contributions are log-odds deviations from the bias).
+#'
 #' @seealso
 #' \code{\link{xgb.train}}.
 #'
@@ -166,11 +180,32 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' ## binary classification:
 #'
 #' library(xgboost)
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
 #' train <- agaricus.train
 #' test <- agaricus.test
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#'                eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 #' # use all trees by default
 #' pred <- predict(bst, test$data)
 #' # use only the 1st tree
-#' pred <- predict(bst, test$data, ntreelimit = 1)
+#' pred1 <- predict(bst, test$data, ntreelimit = 1)
+#'
+#' # Predicting tree leaves:
+#' # the result is an nsamples X ntrees matrix
+#' pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+#' str(pred_leaf)
+#'
+#' # Predicting feature contributions to predictions:
+#' # the result is an nsamples X (nfeatures + 1) matrix
+#' pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+#' str(pred_contr)
+#' # verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+#' summary(rowSums(pred_contr) - qlogis(pred))
+#' # for the 1st record, let's inspect the features that had a non-zero contribution to the prediction:
+#' contr1 <- pred_contr[1,]
+#' contr1 <- contr1[-length(contr1)] # drop BIAS
+#' contr1 <- contr1[contr1 != 0] # drop non-contributing features
+#' contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+#' old_mar <- par("mar")
+#' par(mar = old_mar + c(0,7,0,0))
+#' barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+#' par(mar = old_mar)
 #'
 #'
 #' ## multiclass classification in iris dataset:
@@ -222,8 +257,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #'
 #' @rdname predict.xgb.Booster
 #' @export
-predict.xgb.Booster <- function(object, newdata, missing = NA,
-                                outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {
+predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
+                                predleaf = FALSE, predcontrib = FALSE, reshape = FALSE, ...) {
   object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (!inherits(newdata, "xgb.DMatrix"))
@@ -235,23 +270,40 @@ predict.xgb.Booster <- function(object, newdata, missing = NA,
   if (ntreelimit < 0)
     stop("ntreelimit cannot be negative")
-  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf)
+  option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib)
   ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1], as.integer(ntreelimit))
-  if (length(ret) %% nrow(newdata) != 0)
-    stop("prediction length ", length(ret)," is not multiple of nrows(newdata) ", nrow(newdata))
-  npred_per_case <- length(ret) / nrow(newdata)
-
-  if (predleaf){
-    len <- nrow(newdata)
-    ret <- if (length(ret) == len) {
+  n_ret <- length(ret)
+  n_row <- nrow(newdata)
+  npred_per_case <- n_ret / n_row
+
+  if (n_ret %% n_row != 0)
+    stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
+
+  if (predleaf) {
+    ret <- if (n_ret == n_row) {
       matrix(ret, ncol = 1)
     } else {
-      t(matrix(ret, ncol = len))
+      matrix(ret, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    n_col1 <- ncol(newdata) + 1
+    n_group <- npred_per_case / n_col1
+    dnames <- list(NULL, c(colnames(newdata), "BIAS"))
+    ret <- if (n_ret == n_row) {
+      matrix(ret, ncol = 1, dimnames = dnames)
+    } else if (n_group == 1) {
+      matrix(ret, nrow = n_row, byrow = TRUE, dimnames = dnames)
+    } else {
+      grp_mask <- rep(1:n_col1, n_row) +
+                  rep((0:(n_row - 1)) * n_col1 * n_group, each = n_col1)
+      lapply(1:n_group, function(g) {
+        matrix(ret[grp_mask + n_col1 * (g - 1)], nrow = n_row, byrow = TRUE, dimnames = dnames)
+      })
     }
   } else if (reshape && npred_per_case > 1) {
-    ret <- matrix(ret, ncol = length(ret) / nrow(newdata), byrow = TRUE)
+    ret <- matrix(ret, nrow = n_row, byrow = TRUE)
   }
   return(ret)
 }
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 67a06fe2a..ef93c12a7 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -7,7 +7,7 @@
 \usage{
 \method{predict}{xgb.Booster}(object, newdata, missing = NA,
   outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE,
-  reshape = FALSE, ...)
+  predcontrib = FALSE, reshape = FALSE, ...)
 
 \method{predict}{xgb.Booster.handle}(object, ...)
 }
@@ -28,6 +28,8 @@ It will use all the trees by default (\code{NULL} value).}
 
 \item{predleaf}{whether predict leaf index instead.}
 
+\item{predcontrib}{whether to return feature contributions to individual predictions instead (see Details).}
+
 \item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
 prediction outputs per case. This option has no effect when \code{predleaf = TRUE}.}
 
@@ -41,6 +43,12 @@ the \code{reshape} value.
 
 When \code{predleaf = TRUE}, the output is a matrix object with the
 number of columns corresponding to the number of trees.
+
+When \code{predcontrib = TRUE} and it is not a multiclass setting, the output is a matrix object with
+\code{num_features + 1} columns. The last "+ 1" column corresponds to the bias.
+For a multiclass case, a list of \code{num_class} elements is returned, where each element is
+such a matrix. The contribution values are on the scale of the untransformed margin
+(e.g., for binary classification, the contributions are log-odds deviations from the bias).
 }
 \description{
 Predicted values based on either xgboost model or model handle object.
@@ -49,15 +57,22 @@ Predicted values based on either xgboost model or model handle object.
 Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
 and it is not necessarily equal to the number of trees in a model.
 E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
-But for multiclass classification, there are multiple trees per iteration,
-but \code{ntreelimit} limits the number of boosting iterations.
+But for multiclass classification, while there are multiple trees per iteration,
+\code{ntreelimit} limits the number of boosting iterations.
 
 Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
-since gblinear doesn't keep its boosting history.
+since gblinear doesn't keep its boosting history. 
 
 One possible practical applications of the \code{predleaf} option is to use the model
 as a generator of new features which capture non-linearity and interactions,
 e.g., as implemented in \code{\link{xgb.create.features}}.
+
+Setting \code{predcontrib = TRUE} allows calculating contributions of each feature to
+individual predictions. For "gblinear" booster, feature contributions are simply linear terms
+(feature_beta * feature_value). For "gbtree" booster, a feature's contribution is calculated
+as the sum of the average contributions of that feature's split nodes across all trees to an
+individual prediction, following the idea explained in
+\url{http://blog.datadive.net/interpreting-random-forests/}.
 }
 \examples{
 ## binary classification:
@@ -68,11 +83,32 @@ train <- agaricus.train
 test <- agaricus.test
 
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
-pred <- predict(bst, test$data, ntreelimit = 1)
+pred1 <- predict(bst, test$data, ntreelimit = 1)
+
+# Predicting tree leaves:
+# the result is an nsamples X ntrees matrix
+pred_leaf <- predict(bst, test$data, predleaf = TRUE)
+str(pred_leaf)
+
+# Predicting feature contributions to predictions:
+# the result is an nsamples X (nfeatures + 1) matrix
+pred_contr <- predict(bst, test$data, predcontrib = TRUE)
+str(pred_contr)
+# verify that contributions' sums are equal to log-odds of predictions (up to float precision):
+summary(rowSums(pred_contr) - qlogis(pred))
+# for the 1st record, let's inspect the features that had a non-zero contribution to the prediction:
+contr1 <- pred_contr[1,]
+contr1 <- contr1[-length(contr1)] # drop BIAS
+contr1 <- contr1[contr1 != 0] # drop non-contributing features
+contr1 <- contr1[order(abs(contr1))] # order by contribution magnitude
+old_mar <- par("mar")
+par(mar = old_mar + c(0,7,0,0))
+barplot(contr1, horiz = TRUE, las = 2, xlab = "contribution to prediction in log-odds")
+par(mar = old_mar)
 
 ## multiclass classification in iris dataset:
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 84df814c8..64814c403 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -14,18 +14,19 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 label <- df[, ifelse(Improved == "Marked", 1, 0)]
+nrounds <- 12
 bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
-                  eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+                  eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
                   objective = "binary:logistic", booster = "gbtree")
 bst.GLM <- xgboost(data = sparse_matrix, label = label,
-                  eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+                  eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
                   objective = "binary:logistic", booster = "gblinear")
 feature.names <- colnames(sparse_matrix)
 test_that("xgb.dump works", {
-  expect_length(xgb.dump(bst.Tree), 172)
+  expect_length(xgb.dump(bst.Tree), 200)
   expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
   expect_true(file.exists('xgb.model.dump'))
   expect_gt(file.size('xgb.model.dump'), 8000)
@@ -33,7 +34,7 @@ test_that("xgb.dump works", {
   # JSON format
   dmp <- xgb.dump(bst.Tree, dump_format = "json")
   expect_length(dmp, 1)
-  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
+  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
 })
 test_that("xgb.dump works for gblinear", {
@@ -52,13 +53,74 @@ test_that("xgb.dump works for gblinear", {
   expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
 })
+test_that("predict leaves works", {
+  # no error for gbtree
+  expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
+  expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
+  # error for gblinear
+  expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
+})
+
+test_that("predict feature contributions works", {
+  # gbtree binary classifier
+  expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-6)
+
+  # gblinear binary classifier
+  expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
+  expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
+  expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
+  pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
+  expect_lt(max(abs(rowSums(pred_contr) - pred)), 2e-6)
+  # manual calculation of linear terms
+  coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
+  coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
+  pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
+  expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6)
+
+  # gbtree multiclass
+  lb <- as.numeric(iris$Species) - 1
+  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
+                 max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
+                 objective = "multi:softprob", num_class = 3)
+  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+  expect_is(pred_contr, "list")
+  expect_length(pred_contr, 3)
+  for (g in seq_along(pred_contr)) {
+    expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
+    expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
+  }
+
+  # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
+  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
+                 booster = "gblinear", eta = 0.1, nthread = 1, nrounds = 10,
+                 objective = "multi:softprob", num_class = 3, base_score = 0)
+  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+ 
expect_length(pred_contr, 3) + coefs_all <- xgb.dump(bst)[-c(1,2,6)] %>% as.numeric + for (g in seq_along(pred_contr)) { + expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS")) + expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6) + # manual calculation of linear terms + coefs <- coefs_all[seq(g, length(coefs_all), by = 3)] + coefs <- c(coefs[-1], coefs[1]) # intercept needs to be the last + pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*") + expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), 2e-6) + } +}) + test_that("xgb-attribute functionality", { val <- "my attribute value" list.val <- list(my_attr=val, a=123, b='ok') list.ch <- list.val[order(names(list.val))] list.ch <- lapply(list.ch, as.character) # note: iter is 0-index in xgb attributes - list.default <- list(niter = "9") + list.default <- list(niter = as.character(nrounds - 1)) list.ch <- c(list.ch, list.default) # proper input: expect_error(xgb.attr(bst.Tree, NULL)) @@ -85,7 +147,9 @@ test_that("xgb-attribute functionality", { expect_null(xgb.attributes(bst)) }) -if (grepl('Windows', Sys.info()[['sysname']]) || grepl('Linux', Sys.info()[['sysname']]) || grepl('Darwin', Sys.info()[['sysname']])) { +if (grepl('Windows', Sys.info()[['sysname']]) || + grepl('Linux', Sys.info()[['sysname']]) || + grepl('Darwin', Sys.info()[['sysname']])) { test_that("xgb-attribute numeric precision", { # check that lossless conversion works with 17 digits # numeric -> character -> numeric @@ -121,7 +185,7 @@ test_that("xgb.model.dt.tree works with and without feature names", { names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree) expect_equal(names.dt.trees, names(dt.tree)) - expect_equal(dim(dt.tree), c(162, 10)) + expect_equal(dim(dt.tree), c(188, 10)) expect_output(str(dt.tree), 'Feature.*\\"Age\\"') dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index decdcd0bf..8b37d2dc1 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -384,7 +384,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, * 0:normal prediction * 1:output margin instead of transformed value * 2:output leaf index of trees instead of leaf value, note leaf index is unique per tree - * 4:output feature contributions of all trees instead of predictions + * 4:output feature contributions to individual predictions * \param ntree_limit limit number of trees used for prediction, this is only valid for boosted trees * when the parameter is set to 0, we will use all the trees * \param out_len used to store length of returning result diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index aadd074f8..22f99a7a5 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -109,8 +109,8 @@ class GradientBooster { unsigned ntree_limit = 0) = 0; /*! 
-   * \brief predict the feature contributions of each tree, the output will be nsample * (nfeats + 1) vector
-   *  this is only valid in gbtree predictor
+   * \brief feature contributions to individual predictions; the output will be a vector
+   *        of length (nfeats + 1) * num_output_group * nsample, arranged in that order
    * \param dmat feature matrix
    * \param out_contribs output vector to hold the contributions
    * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 4733f5522..ab41d337b 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -103,7 +103,7 @@ class Learner : public rabit::Serializable {
    * \param ntree_limit limit number of trees used for boosted tree
    *   predictor, when it equals 0, this means we are using all the trees
    * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
-   * \param pred_contribs whether to only predict the feature contributions of all trees
+   * \param pred_contribs whether to only predict the feature contributions
    */
   virtual void Predict(DMatrix* data,
                        bool output_margin,
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 168fe4b57..9aa661f6b 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -180,10 +180,6 @@ class GBLinear : public GradientBooster {
         << "GBLinear::Predict ntrees is only valid for gbtree predictor";
     std::vector<bst_float> &preds = *out_preds;
     const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
-    if (base_margin.size() != 0) {
-      CHECK_EQ(preds.size(), base_margin.size())
-          << "base_margin.size does not match with prediction size";
-    }
     preds.resize(0);
     // start collecting the prediction
     dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
@@ -218,45 +214,87 @@
       this->Pred(inst, dmlc::BeginPtr(*out_preds), gid, base_margin_);
     }
   }
+
   void PredictLeaf(DMatrix *p_fmat, std::vector<bst_float> *out_preds, unsigned ntree_limit) override {
-    LOG(FATAL) << "gblinear does not support predict leaf index";
+    LOG(FATAL) << "gblinear does not support prediction of leaf index";
   }
+
   void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs, unsigned ntree_limit) override {
-    LOG(FATAL) << "gblinear does not support predict contributions";
+    if (model.weight.size() == 0) {
+      model.InitModel();
+    }
+    CHECK_EQ(ntree_limit, 0U)
+        << "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
+    const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
+    const int ngroup = model.param.num_output_group;
+    const size_t ncolumns = model.param.num_feature + 1;
+    // allocate space for (#features + bias) times #groups times #rows
+    std::vector<bst_float>& contribs = *out_contribs;
+    contribs.resize(p_fmat->info().num_row * ncolumns * ngroup);
+    // make sure contributions is zeroed, we could be reusing a previously allocated one
+    std::fill(contribs.begin(), contribs.end(), 0);
+    // start collecting the contributions
+    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch& batch = iter->Value();
+      // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const RowBatch::Inst &inst = batch[i];
+        size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
+        // loop over output groups
+        for (int gid = 0; gid < ngroup; ++gid) {
+          bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
+          // calculate linear terms' contributions
+          for (bst_uint c = 0; c < inst.length; ++c) {
+            if (inst[c].index >= model.param.num_feature) continue;
+            p_contribs[inst[c].index] = inst[c].fvalue * model[inst[c].index][gid];
+          }
+          // add base margin to BIAS
+          p_contribs[ncolumns - 1] = model.bias()[gid] +
+              ((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] : base_margin_);
+        }
+      }
+    }
   }
   std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const override {
+    const int ngroup = model.param.num_output_group;
+    const unsigned nfeature = model.param.num_feature;
+
     std::stringstream fo("");
     if (format == "json") {
       fo << " { \"bias\": [" << std::endl;
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        if (i != 0) fo << "," << std::endl;
-        fo << " " << model.bias()[i];
+      for (int gid = 0; gid < ngroup; ++gid) {
+        if (gid != 0) fo << "," << std::endl;
+        fo << " " << model.bias()[gid];
       }
       fo << std::endl << " ]," << std::endl << " \"weight\": [" << std::endl;
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        for (unsigned j = 0; j < model.param.num_feature; ++j) {
-          if (i != 0 || j != 0) fo << "," << std::endl;
-          fo << " " << model[i][j];
+      for (unsigned i = 0; i < nfeature; ++i) {
+        for (int gid = 0; gid < ngroup; ++gid) {
+          if (i != 0 || gid != 0) fo << "," << std::endl;
+          fo << " " << model[i][gid];
         }
       }
       fo << std::endl << " ]" << std::endl << " }";
     } else {
       fo << "bias:\n";
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        fo << model.bias()[i] << std::endl;
+      for (int gid = 0; gid < ngroup; ++gid) {
+        fo << model.bias()[gid] << std::endl;
      }
      fo << "weight:\n";
-      for (int i = 0; i < model.param.num_output_group; ++i) {
-        for (unsigned j = 0; j < model.param.num_feature; ++j) {
-          fo << model[i][j] << std::endl;
+      for (unsigned i = 0; i < nfeature; ++i) {
+        for (int gid = 0; gid < ngroup; ++gid) {
+          fo << model[i][gid] << std::endl;
        }
      }
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
     if (ntree_limit == 0 || ntree_limit > trees.size()) {
       ntree_limit = static_cast<unsigned>(trees.size());
     }
+    const int ngroup = mparam.num_output_group;
     size_t ncolumns = mparam.num_feature + 1;
     // allocate space for (number of features + bias) times the number of rows
     std::vector<bst_float>& contribs = *out_contribs;
@@ -584,7 +585,7 @@
     }
     // start collecting the contributions
     dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
-    const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
+    const std::vector<bst_float>& base_margin = info.base_margin;
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch& batch = iter->Value();
@@ -596,8 +597,8 @@
         unsigned root_id = info.GetRoot(row_idx);
         RegTree::FVec &feats = thread_temp[omp_get_thread_num()];
         // loop over all classes
-        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-          bst_float *p_contribs = &contribs[(row_idx * mparam.num_output_group + gid) * ncolumns];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
           feats.Fill(batch[i]);
           // calculate contributions
           for (unsigned j = 0; j < ntree_limit; ++j) {
@@ -607,9 +608,9 @@
             trees[j]->CalculateContributions(feats, root_id, p_contribs);
           }
           feats.Drop(batch[i]);
-          // add base margin to BIAS feature
+          // add base margin to BIAS
           if (base_margin.size() != 0) {
-            p_contribs[ncolumns - 1] += base_margin[row_idx * mparam.num_output_group + gid];
+            p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
           } else {
             p_contribs[ncolumns - 1] += base_margin_;
           }
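A brief usage sketch follows; it is an editorial addition rather than part of the patch. It shows how the new multiclass `predcontrib` output (a list with one matrix per class, each with a trailing "BIAS" column) could be consumed, mirroring the checks added to test_helpers.R above, and it assumes an xgboost build that already includes these changes.

library(xgboost)

# multiclass model on iris, as in the new unit tests
lb <- as.numeric(iris$Species) - 1
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
               max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
               objective = "multi:softprob", num_class = 3)

# predcontrib = TRUE returns a list with one matrix per class;
# each matrix has one column per feature plus a final "BIAS" column
pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
margins <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)

# per class, the row sums of the contributions should reproduce the
# untransformed margin (up to float precision)
for (g in seq_along(pred_contr)) {
  cat("class", g, "max abs diff:", max(abs(rowSums(pred_contr[[g]]) - margins[, g])), "\n")
}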