[backport] [R] Fix global feature importance and predict with 1 sample. (#7394) (#7397)

* [R] Fix global feature importance.

* Add implementation for tree index. The parameter is not documented in the C API since we
should work on porting model slicing to R instead of supporting more uses of the tree
index.

* Fix the difference between "gain" and "total_gain".

* debug.

* Fix prediction.
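
For context, a minimal R sketch of the user-facing behaviour this backport targets; the
booster, dataset, and parameter values below are illustrative and not part of the patch:

library(xgboost)
data(agaricus.train, package = "xgboost")

bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  nrounds = 4, objective = "binary:logistic", verbose = 0
)

## Global importance restricted to a slice of trees (0-based indices).
xgb.importance(model = bst, trees = c(0, 1))

## Predicting on a single row (the case fixed here).
x1 <- agaricus.train$data[1, , drop = FALSE]
predict(bst, newdata = x1)                      # length-1 numeric vector
predict(bst, newdata = x1, predcontrib = TRUE)  # per-feature contributions plus a BIAS column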
Jiaming Yuan 2021-11-06 00:07:36 +08:00 committed by GitHub
parent a3d195e73e
commit e7ac2486eb
11 changed files with 119 additions and 49 deletions

@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results
+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
   }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
+  }
-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
-    }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
     }
   }
   return(arr)
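
To make the intended non-strict output shapes concrete, a hedged R sketch with an
illustrative multiclass booster; the dataset, sizes, and parameter values are not taken
from the patch, only the `predict` options are:

library(xgboost)
x <- as.matrix(iris[, 1:4])
y <- as.integer(iris$Species) - 1

mbst <- xgboost(
  data = x, label = y, nrounds = 3, verbose = 0,
  objective = "multi:softprob", num_class = 3
)

str(predict(mbst, x, predleaf = TRUE))         # 150 x 9 matrix of leaf indices (one column per tree)
str(predict(mbst, x, predcontrib = TRUE))      # list of 3 matrices (one per class), each 150 x 5 (4 features + BIAS)
str(predict(mbst, x, predinteraction = TRUE))  # list of 3 arrays (one per class), each 150 x 5 x 5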

@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
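
A hedged note on the semantics restored by switching to "total_gain"/"total_cover": each
column of the importance table is a per-feature total, normalised to sum to one over the
selected trees. A small self-contained sketch (model and data are illustrative):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 4, objective = "binary:logistic", verbose = 0)

## Column -> importance type: Frequency -> "weight", Gain -> "total_gain", Cover -> "total_cover"
imp <- xgb.importance(model = bst, trees = c(0, 1))
sum(imp$Gain)       # ~1: total gain per feature over trees 0-1, divided by the overall total
sum(imp$Cover)      # ~1
sum(imp$Frequency)  # ~1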

@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')
 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
+  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {

@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.

 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:
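
As an R-side illustration of the ``strict_shape`` behaviour described above (the prose
here belongs to the general prediction doc; the R ``predict`` method exposes the same
switch, and the booster and data below are illustrative):

library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 4, objective = "binary:logistic", verbose = 0)

x1 <- agaricus.train$data[1, , drop = FALSE]
str(predict(bst, x1, strict_shape = TRUE))  # 1 x 1 array: every axis is kept, even for one row
str(predict(bst, x1))                       # plain length-1 numeric vector otherwise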

@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
                            bool with_stats,
                            std::string format) const = 0;
-  virtual void FeatureScore(std::string const &importance_type,
-                            std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const = 0;
+  virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
+                            std::vector<bst_feature_t>* features,
+                            std::vector<float>* scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */

@@ -155,9 +155,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Calculate feature score. See doc in C API for outputs.
    */
-  virtual void CalcFeatureScore(std::string const &importance_type,
-                                std::vector<bst_feature_t> *features,
-                                std::vector<float> *scores) = 0;
+  virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
+                                std::vector<bst_feature_t>* features,
+                                std::vector<float>* scores) = 0;
   /*
    * \brief Get number of boosted rounds from gradient booster.

@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }
-  auto& scores = learner->GetThreadLocal().ret_vec_float;
+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
+  auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
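
For reference, a hedged sketch of the JSON configuration that reaches this entry point
from R, mirroring the serialisation used in xgb.importance above; the concrete values are
illustrative:

library(jsonlite)

args <- list(
  importance_type = "total_gain",
  feature_names = NULL,  # serialised as a JSON null because of null = "null"
  tree_idx = c(0L, 2L)   # leave NULL to send a JSON null; downstream then scores every tree
)
toJSON(args, auto_unbox = TRUE, null = "null")
## expected: {"importance_type":"total_gain","feature_names":null,"tree_idx":[0,2]}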

@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);

@@ -300,18 +300,28 @@ class GBTree : public GradientBooster {
     }
   }

-  void FeatureScore(std::string const &importance_type,
-                    std::vector<bst_feature_t> *features,
-                    std::vector<float> *scores) const override {
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                    std::vector<bst_feature_t>* features,
+                    std::vector<float>* scores) const override {
     // Because feature with no importance doesn't appear in the return value so
     // we need to set up another pair of vectors to store the values during
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }
+
+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
-          auto const &node = (*p_tree)[nidx];
+          auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
             split_counts[node.SplitIndex()]++;
             fn(p_tree, nidx, node.SplitIndex());

@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {

@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {
   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));
@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);