* [R] Fix global feature importance.
* Add an implementation for the tree index parameter. The parameter is not documented in the C API, since we should work on porting model slicing to R instead of supporting more uses of the tree index.
* Fix the difference between "gain" and "total_gain".
* Debug.
* Fix prediction.
parent a3d195e73e
commit e7ac2486eb
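For context, a minimal usage sketch of the behavior this commit targets (the data set, booster settings, and variable names below are illustrative, not taken from the commit):

    library(xgboost)

    ## Small multiclass booster: with num_class = 3, each boosting round adds 3 trees.
    data(iris)
    x <- as.matrix(iris[, -5])
    y <- as.integer(iris$Species) - 1
    bst <- xgboost(data = x, label = y, nrounds = 4, verbose = 0,
                   objective = "multi:softprob", num_class = 3)

    ## Global importance over all trees; the Gain column is now computed from
    ## "total_gain" rather than the per-split average "gain".
    xgb.importance(model = bst)

    ## Importance restricted to the trees of class 0, via the new `trees` argument
    ## (0-based indices; class 0 owns trees 0, 3, 6, 9 across the 4 rounds).
    xgb.importance(model = bst, trees = seq(from = 0, by = 3, length.out = 4))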
R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results

+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
||||
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]

+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
   }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
+  }

-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
-    }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
-    }
-  }
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
+    }
+  }
   return(arr)
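To make the branches above concrete, a sketch of the return types a caller sees under the rewritten logic, reusing the hypothetical multiclass booster `bst` and matrix `x` from the earlier sketch:

    p <- predict(bst, x)                        # plain numeric vector, length nrow(x) * 3
    pm <- predict(bst, x, reshape = TRUE)       # nrow(x) x 3 matrix, one column per class
    pc <- predict(bst, x, predcontrib = TRUE)   # list of 3 matrices (one per class),
                                                # each nrow(x) x (ncol(x) + 1), last column "BIAS"
    ps <- predict(bst, x, predcontrib = TRUE, strict_shape = TRUE)
    ## strict_shape short-circuits through the early return(arr) above, so the
    ## result keeps the libxgboost layout: an array with dim = rev(shape).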
R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
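The `.Call` above serializes its arguments to JSON for XGBoosterFeatureScore; a sketch of the payload for one iteration of the loop (values illustrative):

    library(jsonlite)
    args <- list(importance_type = "total_gain", feature_names = NULL, tree_idx = c(0L, 2L))
    toJSON(args, auto_unbox = TRUE, null = "null")
    ## {"importance_type":"total_gain","feature_names":null,"tree_idx":[0,2]}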
R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
+  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {
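The `seq(from = 0, by = nclass, length.out = nrounds)` call in the test works because XGBoost stores multiclass trees round-major; a short sketch of the indexing (hypothetical sizes):

    ## With num_class = 3 and nrounds = 4, the 12 trees are laid out
    ## round 0 -> trees 0 1 2, round 1 -> trees 3 4 5, and so on,
    ## so class 0 owns every third tree starting at index 0.
    nclass <- 3
    nrounds <- 4
    seq(from = 0, by = nclass, length.out = nrounds)  # 0 3 6 9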
doc/prediction.rst
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.

 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:
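A sketch of the case the corrected sentence describes, in R terms (the doc uses the Python argument names; `outputmargin` is the R spelling, and the booster below is hypothetical, built from the x and y of the earlier sketch):

    bst_softmax <- xgboost(data = x, label = y, nrounds = 4, verbose = 0,
                           objective = "multi:softmax", num_class = 3)
    ## Margins cannot be collapsed into class labels, so the margin output keeps
    ## one value per class -- the same shape a multi:softprob model produces.
    pm <- predict(bst_softmax, x, outputmargin = TRUE, strict_shape = TRUE)
    dim(pm)  # c(3, nrow(x)), assuming the R side reverses the C-side shape as above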
include/xgboost/gbm.h
@@ -183,6 +183,7 @@ class GradientBooster : public Model, public Configurable {
                            std::string format) const = 0;

   virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
                             std::vector<bst_feature_t>* features,
                             std::vector<float>* scores) const = 0;
   /*!
include/xgboost/learner.h
@@ -156,6 +156,7 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   * \brief Calculate feature score. See doc in C API for outputs.
   */
  virtual void CalcFeatureScore(std::string const& importance_type,
+                               common::Span<int32_t const> trees,
                                std::vector<bst_feature_t>* features,
                                std::vector<float>* scores) = 0;
src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }

+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
   auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
src/gbm/gbtree.cc
@@ -300,7 +300,7 @@ class GBTree : public GradientBooster {
     }
   }

-  void FeatureScore(std::string const &importance_type,
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
                     std::vector<bst_feature_t>* features,
                     std::vector<float>* scores) const override {
     // Because feature with no importance doesn't appear in the return value so
@@ -308,8 +308,18 @@ class GBTree : public GradientBooster {
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }
+
+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
           auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
src/learner.cc
@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
tests/cpp/gbm/test_gbtree.cc
@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {

   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));
@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
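The final assertion checks the identity that also underlies the R fix: per feature, total_gain equals the per-split average gain times the split count (weight). The same relationship can be sketched from an R model dump, reusing the hypothetical booster `bst` from earlier:

    library(data.table)
    dt <- xgb.model.dt.tree(model = bst)
    stats <- dt[Feature != "Leaf",
                .(total_gain = sum(Quality), gain = mean(Quality), weight = .N),
                by = Feature]
    ## sum = mean * count, mirroring scores_total[i] / scores[i] == scores_weight[i]
    all.equal(stats$total_gain, stats$gain * stats$weight)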