[R] Fix global feature importance and predict with 1 sample. (#7394)
* [R] Fix global feature importance. * Add implementation for tree index. The parameter is not documented in C API since we should work on porting the model slicing to R instead of supporting more use of tree index. * Fix the difference between "gain" and "total_gain". * debug. * Fix prediction.
This commit is contained in:
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
||||
shape <- predts$shape
|
||||
ret <- predts$results
|
||||
|
||||
n_ret <- length(ret)
|
||||
n_row <- nrow(newdata)
|
||||
if (n_row != shape[1]) {
|
||||
stop("Incorrect predict shape.")
|
||||
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
||||
arr <- array(data = ret, dim = rev(shape))
|
||||
|
||||
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
|
||||
n_groups <- shape[2]
|
||||
|
||||
## Needed regardless of whether strict shape is being used.
|
||||
if (predcontrib) {
|
||||
dimnames(arr) <- list(cnames, NULL, NULL)
|
||||
if (!strict_shape) {
|
||||
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
|
||||
}
|
||||
} else if (predinteraction) {
|
||||
dimnames(arr) <- list(cnames, cnames, NULL, NULL)
|
||||
if (!strict_shape) {
|
||||
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
|
||||
}
|
||||
}
|
||||
if (strict_shape) {
|
||||
return(arr) # strict shape is calculated by libxgboost uniformly.
|
||||
}
|
||||
|
||||
if (!strict_shape) {
|
||||
n_groups <- shape[2]
|
||||
if (predleaf) {
|
||||
arr <- matrix(arr, nrow = n_row, byrow = TRUE)
|
||||
} else if (predcontrib && n_groups != 1) {
|
||||
arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
|
||||
} else if (predinteraction && n_groups != 1) {
|
||||
arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
|
||||
} else if (!reshape && n_groups != 1) {
|
||||
arr <- ret
|
||||
} else if (reshape && n_groups != 1) {
|
||||
arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
|
||||
if (predleaf) {
|
||||
## Predict leaf
|
||||
arr <- if (n_ret == n_row) {
|
||||
matrix(arr, ncol = 1)
|
||||
} else {
|
||||
matrix(arr, nrow = n_row, byrow = TRUE)
|
||||
}
|
||||
arr <- drop(arr)
|
||||
if (length(dim(arr)) == 1) {
|
||||
arr <- as.vector(arr)
|
||||
} else if (length(dim(arr)) == 2) {
|
||||
arr <- as.matrix(arr)
|
||||
} else if (predcontrib) {
|
||||
## Predict contribution
|
||||
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
|
||||
arr <- if (n_ret == n_row) {
|
||||
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
|
||||
} else if (n_groups != 1) {
|
||||
## turns array into list of matrices
|
||||
lapply(seq_len(n_groups), function(g) arr[g, , ])
|
||||
} else {
|
||||
## remove the first axis (group)
|
||||
as.matrix(arr[1, , ])
|
||||
}
|
||||
} else if (predinteraction) {
|
||||
## Predict interaction
|
||||
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
|
||||
arr <- if (n_ret == n_row) {
|
||||
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
|
||||
} else if (n_groups != 1) {
|
||||
## turns array into list of matrices
|
||||
lapply(seq_len(n_groups), function(g) arr[g, , , ])
|
||||
} else {
|
||||
## remove the first axis (group)
|
||||
arr[1, , , ]
|
||||
}
|
||||
} else {
|
||||
## Normal prediction
|
||||
arr <- if (reshape && n_groups != 1) {
|
||||
matrix(arr, ncol = n_groups, byrow = TRUE)
|
||||
} else {
|
||||
as.vector(ret)
|
||||
}
|
||||
}
|
||||
return(arr)
|
||||
|
||||
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
|
||||
} else {
|
||||
concatenated <- list()
|
||||
output_names <- vector()
|
||||
for (importance_type in c("weight", "gain", "cover")) {
|
||||
args <- list(importance_type = importance_type, feature_names = feature_names)
|
||||
for (importance_type in c("weight", "total_gain", "total_cover")) {
|
||||
args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
|
||||
results <- .Call(
|
||||
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
|
||||
)
|
||||
names(results) <- c("features", "shape", importance_type)
|
||||
concatenated[
|
||||
switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
|
||||
switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
|
||||
] <- results[importance_type]
|
||||
output_names <- results$features
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
library(testthat)
|
||||
context('Test helper functions')
|
||||
|
||||
require(xgboost)
|
||||
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
|
||||
# for multiclass
|
||||
imp.Tree <- xgb.importance(model = mbst.Tree)
|
||||
expect_equal(dim(imp.Tree), c(4, 4))
|
||||
xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
|
||||
|
||||
trees <- seq(from = 0, by = 2, length.out = 2)
|
||||
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
|
||||
|
||||
importance_from_dump <- function() {
|
||||
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
|
||||
imp <- xgb.model.dt.tree(
|
||||
feature_names = feature.names,
|
||||
text = model_text_dump,
|
||||
trees = trees
|
||||
)[
|
||||
Feature != "Leaf", .(
|
||||
Gain = sum(Quality),
|
||||
Cover = sum(Cover),
|
||||
Frequency = .N
|
||||
),
|
||||
by = Feature
|
||||
][
|
||||
, `:=`(
|
||||
Gain = Gain / sum(Gain),
|
||||
Cover = Cover / sum(Cover),
|
||||
Frequency = Frequency / sum(Frequency)
|
||||
)
|
||||
][
|
||||
order(Gain, decreasing = TRUE)
|
||||
]
|
||||
imp
|
||||
}
|
||||
expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
|
||||
})
|
||||
|
||||
test_that("xgb.importance works with GLM model", {
|
||||
|
||||
Reference in New Issue
Block a user