diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 758c454af..d838ce4e5 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.6.4.5
+Version: 0.6.4.6
 Date: 2017-01-04
 Author: Tianqi Chen , Tong He ,
   Michael Benesty , Vadim Khotilovich ,
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 15de5bfa2..e6cfb1af5 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -6,6 +6,11 @@
 #' contains feature names, those would be used when \code{feature_names=NULL} (default value).
 #' Non-null \code{feature_names} could be provided to override those in the model.
 #' @param model object of class \code{xgb.Booster}.
+#' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included
+#'   in the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+#'   It could be useful, e.g., in multiclass classification to get feature importances
+#'   for each class separately. IMPORTANT: the tree index in xgboost models
+#'   is zero-based (e.g., use \code{trees = 0:4} for the first 5 trees).
 #' @param data deprecated.
 #' @param label deprecated.
 #' @param target deprecated.
@@ -32,27 +37,51 @@
 #'     a feature have been used in trees.
 #' }
 #'
-#' A linear model's importance \code{data.table} has only two columns:
+#' A linear model's importance \code{data.table} has the following columns:
 #' \itemize{
 #'   \item \code{Features} names of the features used in the model;
-#'   \item \code{Weight} the linear coefficient of this feature.
+#'   \item \code{Weight} the linear coefficient of this feature;
+#'   \item \code{Class} (only for multiclass models) class label.
 #' }
 #'
-#' If you don't provide or \code{model} doesn't have \code{feature_names},
+#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
 #' index of the features will be used instead. Because the index is extracted from the model dump
 #' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 #'
 #' @examples
 #'
+#' # binomial classification using gbtree:
 #' data(agaricus.train, package='xgboost')
-#'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-#'                eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
-#'
+#'                eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' xgb.importance(model = bst)
 #'
+#' # binomial classification using gblinear:
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
+#'                eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+#' xgb.importance(model = bst)
+#'
+#' # multiclass classification using gbtree:
+#' nclass <- 3
+#' nrounds <- 10
+#' mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+#'                 max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+#'                 objective = "multi:softprob", num_class = nclass)
+#' # all classes clumped together:
+#' xgb.importance(model = mbst)
+#' # inspect importances separately for each class:
+#' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
+#'
+#' # multiclass classification using gblinear:
+#' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+#'                 booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+#'                 objective = "multi:softprob", num_class = nclass)
+#' xgb.importance(model = mbst)
+#'
 #' @export
-xgb.importance <- function(feature_names = NULL, model = NULL,
+xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
                            data = NULL, label = NULL, target = NULL){
 
   if (!(is.null(data) && is.null(label) && is.null(target)))
@@ -74,14 +103,25 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
     weights <- which(model_text_dump == "weight:") %>%
                {model_text_dump[(. + 1):length(model_text_dump)]} %>%
                as.numeric
+
+    num_class <- NVL(model$params$num_class, 1)
     if(is.null(feature_names))
-      feature_names <- seq(to = length(weights))
-    if (length(feature_names) != length(weights))
-      stop("feature_names has less elements than there are features used in the model")
-    result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+      feature_names <- seq(to = length(weights) / num_class) - 1
+    if (length(feature_names) * num_class != length(weights))
+      stop("feature_names length does not match the number of features used in the model")
+
+    result <- if (num_class == 1) {
+      data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+    } else {
+      data.table(Feature = rep(feature_names, each = num_class),
+                 Weight = weights,
+                 Class = 0:(num_class - 1))[order(Class, -abs(Weight))]
+    }
   } else {
   # tree model
-    result <- xgb.model.dt.tree(feature_names = feature_names, text = model_text_dump)[
+    result <- xgb.model.dt.tree(feature_names = feature_names,
+                                text = model_text_dump,
+                                trees = trees)[
       Feature != "Leaf", .(Gain = sum(Quality),
                            Cover = sum(Cover),
                            Frequency = .N), by = Feature][
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 11a6cc854..5c968d207 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -4,8 +4,8 @@
 \alias{xgb.importance}
 \title{Importance of features in a model.}
 \usage{
-xgb.importance(feature_names = NULL, model = NULL, data = NULL,
-  label = NULL, target = NULL)
+xgb.importance(feature_names = NULL, model = NULL, trees = NULL,
+  data = NULL, label = NULL, target = NULL)
 }
 \arguments{
 \item{feature_names}{character vector of feature names. If the model already
@@ -14,6 +14,12 @@ Non-null \code{feature_names} could be provided to override those in the model.}
 
 \item{model}{object of class \code{xgb.Booster}.}
 
+\item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included
+in the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+It could be useful, e.g., in multiclass classification to get feature importances
+for each class separately. IMPORTANT: the tree index in xgboost models
+is zero-based (e.g., use \code{trees = 0:4} for the first 5 trees).}
+
 \item{data}{deprecated.}
 
 \item{label}{deprecated.}
@@ -32,13 +38,14 @@ For a tree model, a \code{data.table} with the following columns:
     a feature have been used in trees.
 }
 
-A linear model's importance \code{data.table} has only two columns:
+A linear model's importance \code{data.table} has the following columns:
 \itemize{
   \item \code{Features} names of the features used in the model;
-  \item \code{Weight} the linear coefficient of this feature.
+  \item \code{Weight} the linear coefficient of this feature;
+  \item \code{Class} (only for multiclass models) class label.
 }
 
-If you don't provide or \code{model} doesn't have \code{feature_names},
+If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
 index of the features will be used instead. Because the index is extracted from the model dump
 (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 }
@@ -55,11 +62,34 @@ L1 or L2 regularization).
 }
 
 \examples{
+# binomial classification using gbtree:
 data(agaricus.train, package='xgboost')
-
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
-
+               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 xgb.importance(model = bst)
 
+# binomial classification using gblinear:
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
+               eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+xgb.importance(model = bst)
+
+# multiclass classification using gbtree:
+nclass <- 3
+nrounds <- 10
+mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+               max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+               objective = "multi:softprob", num_class = nclass)
+# all classes clumped together:
+xgb.importance(model = mbst)
+# inspect importances separately for each class:
+xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
+
+# multiclass classification using gblinear:
+mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+               booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+               objective = "multi:softprob", num_class = nclass)
+xgb.importance(model = mbst)
+
 }
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 64814c403..53f5deab0 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -14,6 +14,7 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 label <- df[, ifelse(Improved == "Marked", 1, 0)]
 
+# binary
 nrounds <- 12
 bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
                     eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
@@ -25,6 +26,18 @@ bst.GLM <- xgboost(data = sparse_matrix, label = label,
 
 feature.names <- colnames(sparse_matrix)
 
+# multiclass
+mlabel <- as.numeric(iris$Species) - 1
+nclass <- 3
+mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+                     max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
+                     objective = "multi:softprob", num_class = nclass, base_score = 0)
+
+mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+                    booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
+                    objective = "multi:softprob", num_class = nclass, base_score = 0)
+
+
 test_that("xgb.dump works", {
   expect_length(xgb.dump(bst.Tree), 200)
   expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
@@ -82,12 +95,8 @@ test_that("predict feature contributions works", {
   expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6)
 
   # gbtree multiclass
-  lb <- as.numeric(iris$Species) - 1
-  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
-                 max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-                 objective = "multi:softprob", num_class = 3)
-  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
-  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+  pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
   expect_is(pred_contr, "list")
   expect_length(pred_contr, 3)
   for (g in seq_along(pred_contr)) {
@@ -96,19 +105,15 @@ test_that("predict feature contributions works", {
   }
 
   # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
-  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
-                 booster = "gblinear", eta = 0.1, nthread = 1, nrounds = 10,
-                 objective = "multi:softprob", num_class = 3, base_score = 0)
-  pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
-  pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+  pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+  pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
   expect_length(pred_contr, 3)
-  coefs_all <- xgb.dump(bst)[-c(1,2,6)] %>% as.numeric
+  coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
   for (g in seq_along(pred_contr)) {
     expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
     expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
     # manual calculation of linear terms
-    coefs <- coefs_all[seq(g, length(coefs_all), by = 3)]
-    coefs <- c(coefs[-1], coefs[1]) # intercept needs to be the last
+    coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
     pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
     expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), 2e-6)
   }
@@ -227,6 +232,11 @@ test_that("xgb.importance works with and without feature names", {
   imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
   expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
   xgb.ggplot.importance(importance_matrix = importance.Tree)
+
+  # for multiclass
+  imp.Tree <- xgb.importance(model = mbst.Tree)
+  expect_equal(dim(imp.Tree), c(4, 4))
+  xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds))
 })
 
 test_that("xgb.importance works with GLM model", {
@@ -237,6 +247,11 @@ test_that("xgb.importance works with GLM model", {
   imp2plot <- xgb.plot.importance(importance.GLM)
   expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
   xgb.ggplot.importance(importance.GLM)
+
+  # for multiclass
+  imp.GLM <- xgb.importance(model = mbst.GLM)
+  expect_equal(dim(imp.GLM), c(12, 3))
+  expect_equal(imp.GLM$Class, rep(0:2, each=4))
 })
 
 test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
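
For reference, a minimal usage sketch of the new trees argument (not part of the patch; it only mirrors the examples added above). With objective = "multi:softprob", each boosting round produces one tree per class, so the zero-based indices of the trees belonging to class k are k, k + num_class, k + 2*num_class, and so on. The object names below (mbst, per_class_importance) are purely illustrative.

library(xgboost)

# small multiclass gbtree model, as in the examples added by this patch
nclass <- 3
nrounds <- 10
mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
                max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
                objective = "multi:softprob", num_class = nclass)

# trees of class k sit at zero-based indices k, k + nclass, k + 2*nclass, ...
per_class_importance <- lapply(0:(nclass - 1), function(k) {
  xgb.importance(model = mbst,
                 trees = seq(from = k, by = nclass, length.out = nrounds))
})
names(per_class_importance) <- levels(iris$Species)
per_class_importance

# for a multiclass gblinear model, xgb.importance() instead returns one row per
# feature/class pair with a Class column, which can be reshaped with, e.g., xtabs().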