[R] xgb.importance: fix for multiclass gblinear, new 'trees' parameter (#2388)

Vadim Khotilovich, 2017-06-07 13:13:21 -05:00, committed by GitHub
parent 2ae56ca84f
commit c82276386d
4 changed files with 120 additions and 35 deletions
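In short: xgb.importance() gains a 'trees' argument for the gbtree booster, and the gblinear branch now handles multiclass models by reporting one coefficient per feature and class. A quick sketch of the intended use, lifted from the examples added in this commit (the iris-based model settings are illustrative only):

library(xgboost)
nclass <- 3
nrounds <- 10
mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
                max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
                objective = "multi:softprob", num_class = nclass)
# with multi:softprob, each boosting round adds one tree per class, so the trees of
# class k sit at the zero-based indices k, k + nclass, k + 2*nclass, ...
xgb.importance(model = mbst)                                                            # all classes together
xgb.importance(model = mbst, trees = seq(from = 0, by = nclass, length.out = nrounds))  # class 0 only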

View File

@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 0.6.4.5
+Version: 0.6.4.6
 Date: 2017-01-04
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
 Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,

View File

@@ -6,6 +6,11 @@
 #' contains feature names, those would be used when \code{feature_names=NULL} (default value).
 #' Non-null \code{feature_names} could be provided to override those in the model.
 #' @param model object of class \code{xgb.Booster}.
+#' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included
+#' into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+#' It could be useful, e.g., in multiclass classification to get feature importances
+#' for each class separately. IMPORTANT: the tree index in xgboost models
+#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
 #' @param data deprecated.
 #' @param label deprecated.
 #' @param target deprecated.
@@ -32,27 +37,51 @@
 #' a feature have been used in trees.
 #' }
 #'
-#' A linear model's importance \code{data.table} has only two columns:
+#' A linear model's importance \code{data.table} has the following columns:
 #' \itemize{
 #' \item \code{Features} names of the features used in the model;
-#' \item \code{Weight} the linear coefficient of this feature.
+#' \item \code{Weight} the linear coefficient of this feature;
+#' \item \code{Class} (only for multiclass models) class label.
 #' }
 #'
-#' If you don't provide or \code{model} doesn't have \code{feature_names},
+#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
 #' index of the features will be used instead. Because the index is extracted from the model dump
 #' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 #'
 #' @examples
 #'
+#' # binomial classification using gbtree:
 #' data(agaricus.train, package='xgboost')
-#'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
-#'
+#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 #' xgb.importance(model = bst)
 #'
+#' # binomial classification using gblinear:
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
+#' eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+#' xgb.importance(model = bst)
+#'
+#' # multiclass classification using gbtree:
+#' nclass <- 3
+#' nrounds <- 10
+#' mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+#' max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+#' objective = "multi:softprob", num_class = nclass)
+#' # all classes clumped together:
+#' xgb.importance(model = mbst)
+#' # inspect importances separately for each class:
+#' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+#' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
+#'
+#' # multiclass classification using gblinear:
+#' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+#' booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+#' objective = "multi:softprob", num_class = nclass)
+#' xgb.importance(model = mbst)
+#'
 #' @export
-xgb.importance <- function(feature_names = NULL, model = NULL,
+xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
 data = NULL, label = NULL, target = NULL){
 if (!(is.null(data) && is.null(label) && is.null(target)))
@@ -74,14 +103,25 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
 weights <- which(model_text_dump == "weight:") %>%
 {model_text_dump[(. + 1):length(model_text_dump)]} %>%
 as.numeric
+num_class <- NVL(model$params$num_class, 1)
 if(is.null(feature_names))
-feature_names <- seq(to = length(weights))
-if (length(feature_names) != length(weights))
-stop("feature_names has less elements than there are features used in the model")
-result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+feature_names <- seq(to = length(weights) / num_class) - 1
+if (length(feature_names) * num_class != length(weights))
+stop("feature_names length does not match the number of features used in the model")
+result <- if (num_class == 1) {
+data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+} else {
+data.table(Feature = rep(feature_names, each = num_class),
+Weight = weights,
+Class = 0:(num_class - 1))[order(Class, -abs(Weight))]
+}
 } else {
 # tree model
-result <- xgb.model.dt.tree(feature_names = feature_names, text = model_text_dump)[
+result <- xgb.model.dt.tree(feature_names = feature_names,
+text = model_text_dump,
+trees = trees)[
 Feature != "Leaf", .(Gain = sum(Quality),
 Cover = sum(Cover),
 Frequency = .N), by = Feature][
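A note on the multiclass gblinear branch above: the coefficient vector parsed from the dump is assumed to be laid out feature-major, i.e. num_class consecutive values per feature, which is why Feature is repeated with each = num_class while Class recycles 0:(num_class - 1). A minimal sketch of the resulting table with made-up numbers (the feature names and weights below are hypothetical, not taken from a real model dump):

library(data.table)
feature_names <- c("x1", "x2")
num_class <- 3
weights <- c(0.10, -0.20, 0.05,   # x1: classes 0, 1, 2
             0.40,  0.00, -0.30)  # x2: classes 0, 1, 2
# same construction as in the new code path; rows come out sorted by Class, then by |Weight|
data.table(Feature = rep(feature_names, each = num_class),
           Weight  = weights,
           Class   = 0:(num_class - 1))[order(Class, -abs(Weight))]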

View File

@@ -4,8 +4,8 @@
 \alias{xgb.importance}
 \title{Importance of features in a model.}
 \usage{
-xgb.importance(feature_names = NULL, model = NULL, data = NULL,
-label = NULL, target = NULL)
+xgb.importance(feature_names = NULL, model = NULL, trees = NULL,
+data = NULL, label = NULL, target = NULL)
 }
 \arguments{
 \item{feature_names}{character vector of feature names. If the model already
@@ -14,6 +14,12 @@ Non-null \code{feature_names} could be provided to override those in the model.}
 \item{model}{object of class \code{xgb.Booster}.}
+\item{trees}{(only for the gbtree booster) an integer vector of tree indices that should be included
+into the importance calculation. If set to \code{NULL}, all trees of the model are parsed.
+It could be useful, e.g., in multiclass classification to get feature importances
+for each class separately. IMPORTANT: the tree index in xgboost models
+is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
 \item{data}{deprecated.}
 \item{label}{deprecated.}
@@ -32,13 +38,14 @@ For a tree model, a \code{data.table} with the following columns:
 a feature have been used in trees.
 }
-A linear model's importance \code{data.table} has only two columns:
+A linear model's importance \code{data.table} has the following columns:
 \itemize{
 \item \code{Features} names of the features used in the model;
-\item \code{Weight} the linear coefficient of this feature.
+\item \code{Weight} the linear coefficient of this feature;
+\item \code{Class} (only for multiclass models) class label.
 }
-If you don't provide or \code{model} doesn't have \code{feature_names},
+If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names},
 index of the features will be used instead. Because the index is extracted from the model dump
 (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 }
@@ -55,11 +62,34 @@ L1 or L2 regularization).
 }
 \examples{
+# binomial classification using gbtree:
 data(agaricus.train, package='xgboost')
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 xgb.importance(model = bst)
+# binomial classification using gblinear:
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear",
+eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic")
+xgb.importance(model = bst)
+# multiclass classification using gbtree:
+nclass <- 3
+nrounds <- 10
+mbst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
+max_depth = 3, eta = 0.2, nthread = 2, nrounds = nrounds,
+objective = "multi:softprob", num_class = nclass)
+# all classes clumped together:
+xgb.importance(model = mbst)
+# inspect importances separately for each class:
+xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds))
+xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds))
+# multiclass classification using gblinear:
+mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1,
+booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15,
+objective = "multi:softprob", num_class = nclass)
+xgb.importance(model = mbst)
 }
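(The .Rd changes above simply mirror the roxygen comments edited earlier; assuming the package's usual roxygen2/devtools workflow, they would be regenerated with devtools::document() from the R package directory rather than edited by hand.)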

View File

@@ -14,6 +14,7 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 label <- df[, ifelse(Improved == "Marked", 1, 0)]
+# binary
 nrounds <- 12
 bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
 eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
@@ -25,6 +26,18 @@ bst.GLM <- xgboost(data = sparse_matrix, label = label,
 feature.names <- colnames(sparse_matrix)
+# multiclass
+mlabel <- as.numeric(iris$Species) - 1
+nclass <- 3
+mbst.Tree <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+max_depth = 3, eta = 0.5, nthread = 2, nrounds = nrounds,
+objective = "multi:softprob", num_class = nclass, base_score = 0)
+mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0,
+booster = "gblinear", eta = 0.1, nthread = 1, nrounds = nrounds,
+objective = "multi:softprob", num_class = nclass, base_score = 0)
 test_that("xgb.dump works", {
 expect_length(xgb.dump(bst.Tree), 200)
 expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
@@ -82,12 +95,8 @@ test_that("predict feature contributions works", {
 expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6)
 # gbtree multiclass
-lb <- as.numeric(iris$Species) - 1
-bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
-max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-objective = "multi:softprob", num_class = 3)
-pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
-pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+pred <- predict(mbst.Tree, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+pred_contr <- predict(mbst.Tree, as.matrix(iris[, -5]), predcontrib = TRUE)
 expect_is(pred_contr, "list")
 expect_length(pred_contr, 3)
 for (g in seq_along(pred_contr)) {
@@ -96,19 +105,15 @@ test_that("predict feature contributions works", {
 }
 # gblinear multiclass (set base_score = 0, which is base margin in multiclass)
-bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
-booster = "gblinear", eta = 0.1, nthread = 1, nrounds = 10,
-objective = "multi:softprob", num_class = 3, base_score = 0)
-pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
-pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
+pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
+pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE)
 expect_length(pred_contr, 3)
-coefs_all <- xgb.dump(bst)[-c(1,2,6)] %>% as.numeric
+coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE)
 for (g in seq_along(pred_contr)) {
 expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
 expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
 # manual calculation of linear terms
-coefs <- coefs_all[seq(g, length(coefs_all), by = 3)]
-coefs <- c(coefs[-1], coefs[1]) # intercept needs to be the last
+coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last
 pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
 expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), 2e-6)
 }
@@ -227,6 +232,11 @@ test_that("xgb.importance works with and without feature names", {
 imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
 expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
 xgb.ggplot.importance(importance_matrix = importance.Tree)
+# for multiclass
+imp.Tree <- xgb.importance(model = mbst.Tree)
+expect_equal(dim(imp.Tree), c(4, 4))
+xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds))
 })
 test_that("xgb.importance works with GLM model", {
@@ -237,6 +247,11 @@ test_that("xgb.importance works with GLM model", {
 imp2plot <- xgb.plot.importance(importance.GLM)
 expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
 xgb.ggplot.importance(importance.GLM)
+# for multiclass
+imp.GLM <- xgb.importance(model = mbst.GLM)
+expect_equal(dim(imp.GLM), c(12, 3))
+expect_equal(imp.GLM$Class, rep(0:2, each=4))
 })
 test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {