Implement feature score for linear model. (#7048)

* Add feature score support for linear model.
* Port R interface to the new implementation.
* Add linear model support in Python.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Jiaming Yuan authored 2021-06-25 14:34:02 +08:00, committed by GitHub
parent b2d300e727
commit 663136aa08
18 changed files with 367 additions and 232 deletions
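For context, a minimal sketch of what this change enables from the Python side. It assumes a build that includes this commit; the synthetic data and parameter values are illustrative, not taken from the diff below.

import numpy as np
import xgboost as xgb

# Synthetic multi-class data (illustrative only).
rng = np.random.RandomState(0)
X = rng.randn(100, 4)
y = rng.randint(0, 3, size=100)

dtrain = xgb.DMatrix(X, label=y)
booster = xgb.train(
    {"booster": "gblinear", "objective": "multi:softprob", "num_class": 3},
    dtrain,
    num_boost_round=4,
)

# New in this commit: get_score works for gblinear. Only "weight" is
# defined, and for a multi-class model each feature maps to a list of
# n_classes coefficients (the bias is excluded).
scores = booster.get_score(importance_type="weight")
for feature, weight in scores.items():
    print(feature, weight)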

View File

@@ -96,41 +96,44 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   if (!(is.null(feature_names) || is.character(feature_names)))
     stop("feature_names: Has to be a character vector")
 
-  model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
-
-  # linear model
-  if (model_text_dump[2] == "bias:"){
-    weight_index <- which(model_text_dump == "weight:") + 1
-    weights <- as.numeric(
-      model_text_dump[weight_index:length(model_text_dump)]
-    )
-
-    num_class <- NVL(model$params$num_class, 1)
-    if (is.null(feature_names))
-      feature_names <- seq(to = length(weights) / num_class) - 1
-    if (length(feature_names) * num_class != length(weights))
-      stop("feature_names length does not match the number of features used in the model")
-    result <- if (num_class == 1) {
-      data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
+  model <- xgb.Booster.complete(model)
+  config <- jsonlite::fromJSON(xgb.config(model))
+  if (config$learner$gradient_booster$name == "gblinear") {
+    args <- list(importance_type = "weight", feature_names = feature_names)
+    results <- .Call(
+      XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
+    )
+    names(results) <- c("features", "shape", "weight")
+    n_classes <- if (length(results$shape) == 2) { results$shape[2] } else { 0 }
+    importance <- if (n_classes == 0) {
+      data.table(Feature = results$features, Weight = results$weight)[order(-abs(Weight))]
     } else {
-      data.table(Feature = rep(feature_names, each = num_class),
-                 Weight = weights,
-                 Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))]
+      data.table(
+        Feature = rep(results$features, each = n_classes), Weight = results$weight, Class = seq_len(n_classes) - 1
+      )[order(Class, -abs(Weight))]
     }
-  } else { # tree model
-    result <- xgb.model.dt.tree(feature_names = feature_names,
-                                text = model_text_dump,
-                                trees = trees)[
-                                  Feature != "Leaf", .(Gain = sum(Quality),
-                                                       Cover = sum(Cover),
-                                                       Frequency = .N), by = Feature][
-                                    , `:=`(Gain = Gain / sum(Gain),
-                                           Cover = Cover / sum(Cover),
-                                           Frequency = Frequency / sum(Frequency))][
-                                      order(Gain, decreasing = TRUE)]
+  } else {
+    concatenated <- list()
+    output_names <- vector()
+    for (importance_type in c("weight", "gain", "cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names)
+      results <- .Call(
+        XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
+      )
+      names(results) <- c("features", "shape", importance_type)
+      concatenated[
+        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+      ] <- results[importance_type]
+      output_names <- results$features
+    }
+    importance <- data.table(
+      Feature = output_names,
+      Gain = concatenated$Gain / sum(concatenated$Gain),
+      Cover = concatenated$Cover / sum(concatenated$Cover),
+      Frequency = concatenated$Frequency / sum(concatenated$Frequency)
+    )[order(Gain, decreasing = TRUE)]
   }
 
-  result
+  importance
 }
 
 # Avoid error messages during CRAN check.

View File

@@ -47,6 +47,7 @@ extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP);
 extern SEXP XGBSetGlobalConfig_R(SEXP);
 extern SEXP XGBGetGlobalConfig_R();
+extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP);
 
 static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterBoostOneIter_R",       (DL_FUNC) &XGBoosterBoostOneIter_R,       4},
@@ -81,6 +82,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGDMatrixSliceDMatrix_R",       (DL_FUNC) &XGDMatrixSliceDMatrix_R,       2},
   {"XGBSetGlobalConfig_R",          (DL_FUNC) &XGBSetGlobalConfig_R,          1},
   {"XGBGetGlobalConfig_R",          (DL_FUNC) &XGBGetGlobalConfig_R,          0},
+  {"XGBoosterFeatureScore_R",       (DL_FUNC) &XGBoosterFeatureScore_R,       2},
   {NULL, NULL, 0}
 };

View File

@@ -38,11 +38,11 @@
 
 using namespace dmlc;
 
-SEXP XGCheckNullPtr_R(SEXP handle) {
+XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
   return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
 }
 
-void _DMatrixFinalizer(SEXP ext) {
+XGB_DLL void _DMatrixFinalizer(SEXP ext) {
   R_API_BEGIN();
   if (R_ExternalPtrAddr(ext) == NULL) return;
   CHECK_CALL(XGDMatrixFree(R_ExternalPtrAddr(ext)));
@@ -50,14 +50,14 @@ void _DMatrixFinalizer(SEXP ext) {
   R_API_END();
 }
 
-SEXP XGBSetGlobalConfig_R(SEXP json_str) {
+XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str) {
   R_API_BEGIN();
   CHECK_CALL(XGBSetGlobalConfig(CHAR(asChar(json_str))));
   R_API_END();
   return R_NilValue;
 }
 
-SEXP XGBGetGlobalConfig_R() {
+XGB_DLL SEXP XGBGetGlobalConfig_R() {
   const char* json_str;
   R_API_BEGIN();
   CHECK_CALL(XGBGetGlobalConfig(&json_str));
@@ -65,7 +65,7 @@ SEXP XGBGetGlobalConfig_R() {
   return mkString(json_str);
 }
 
-SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
+XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
   SEXP ret;
   R_API_BEGIN();
   DMatrixHandle handle;
@@ -77,8 +77,7 @@ SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
   return ret;
 }
 
-SEXP XGDMatrixCreateFromMat_R(SEXP mat,
-                              SEXP missing) {
+XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) {
   SEXP ret;
   R_API_BEGIN();
   SEXP dim = getAttrib(mat, R_DimSymbol);
@@ -112,10 +111,8 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
   return ret;
 }
 
-SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
-                              SEXP indices,
-                              SEXP data,
-                              SEXP num_row) {
+XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data,
+                                      SEXP num_row) {
   SEXP ret;
   R_API_BEGIN();
   const int *p_indptr = INTEGER(indptr);
@@ -151,7 +148,7 @@ SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
   return ret;
 }
 
-SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
+XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
   SEXP ret;
   R_API_BEGIN();
   int len = length(idxset);
@@ -171,7 +168,7 @@ SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
   return ret;
 }
 
-SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
+XGB_DLL SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
   R_API_BEGIN();
   CHECK_CALL(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                                  CHAR(asChar(fname)),
@@ -180,7 +177,7 @@ SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
   return R_NilValue;
 }
 
-SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
+XGB_DLL SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
   R_API_BEGIN();
   int len = length(array);
   const char *name = CHAR(asChar(field));
@@ -214,7 +211,7 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
   return R_NilValue;
 }
 
-SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
+XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
   SEXP ret;
   R_API_BEGIN();
   bst_ulong olen;
@@ -232,7 +229,7 @@ SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
   return ret;
 }
 
-SEXP XGDMatrixNumRow_R(SEXP handle) {
+XGB_DLL SEXP XGDMatrixNumRow_R(SEXP handle) {
   bst_ulong nrow;
   R_API_BEGIN();
   CHECK_CALL(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow));
@@ -240,7 +237,7 @@ SEXP XGDMatrixNumRow_R(SEXP handle) {
   return ScalarInteger(static_cast<int>(nrow));
 }
 
-SEXP XGDMatrixNumCol_R(SEXP handle) {
+XGB_DLL SEXP XGDMatrixNumCol_R(SEXP handle) {
   bst_ulong ncol;
   R_API_BEGIN();
   CHECK_CALL(XGDMatrixNumCol(R_ExternalPtrAddr(handle), &ncol));
@@ -255,7 +252,7 @@ void _BoosterFinalizer(SEXP ext) {
   R_ClearExternalPtr(ext);
 }
 
-SEXP XGBoosterCreate_R(SEXP dmats) {
+XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
   SEXP ret;
   R_API_BEGIN();
   int len = length(dmats);
@@ -272,7 +269,7 @@ SEXP XGBoosterCreate_R(SEXP dmats) {
   return ret;
 }
 
-SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
+XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
   R_API_BEGIN();
   int len = length(dmats);
   std::vector<void*> dvec;
@@ -287,7 +284,7 @@ SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
   return R_NilValue;
 }
 
-SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
+XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterSetParam(R_ExternalPtrAddr(handle),
                                CHAR(asChar(name)),
@@ -296,7 +293,7 @@ SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
   return R_NilValue;
 }
 
-SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
+XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
                                     asInteger(iter),
@@ -305,7 +302,7 @@ SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
   return R_NilValue;
 }
 
-SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
+XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
   R_API_BEGIN();
   CHECK_EQ(length(grad), length(hess))
       << "gradient and hess must have same length";
@@ -328,7 +325,7 @@ SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
   return R_NilValue;
 }
 
-SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
+XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
   const char *ret;
   R_API_BEGIN();
   CHECK_EQ(length(dmats), length(evnames))
@@ -353,8 +350,8 @@ SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
   return mkString(ret);
 }
 
-SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
-                        SEXP ntree_limit, SEXP training) {
+XGB_DLL SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
+                                SEXP ntree_limit, SEXP training) {
   SEXP ret;
   R_API_BEGIN();
   bst_ulong olen;
@@ -374,7 +371,7 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
   return ret;
 }
 
-SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
+XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
   SEXP r_out_shape;
   SEXP r_out_result;
   SEXP r_out;
@@ -413,21 +410,21 @@ SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
   return r_out;
 }
 
-SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
+XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
   R_API_END();
   return R_NilValue;
 }
 
-SEXP XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
+XGB_DLL SEXP XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
   R_API_END();
   return R_NilValue;
 }
 
-SEXP XGBoosterModelToRaw_R(SEXP handle) {
+XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle) {
   SEXP ret;
   R_API_BEGIN();
   bst_ulong olen;
@@ -442,7 +439,7 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
   return ret;
 }
 
-SEXP XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
+XGB_DLL SEXP XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
                                           RAW(raw),
@@ -451,7 +448,7 @@ SEXP XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
   return R_NilValue;
 }
 
-SEXP XGBoosterSaveJsonConfig_R(SEXP handle) {
+XGB_DLL SEXP XGBoosterSaveJsonConfig_R(SEXP handle) {
   const char* ret;
   R_API_BEGIN();
   bst_ulong len {0};
@@ -462,14 +459,14 @@ SEXP XGBoosterSaveJsonConfig_R(SEXP handle) {
   return mkString(ret);
 }
 
-SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) {
+XGB_DLL SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value))));
   R_API_END();
   return R_NilValue;
 }
 
-SEXP XGBoosterSerializeToBuffer_R(SEXP handle) {
+XGB_DLL SEXP XGBoosterSerializeToBuffer_R(SEXP handle) {
   SEXP ret;
   R_API_BEGIN();
   bst_ulong out_len;
@@ -484,7 +481,7 @@ SEXP XGBoosterSerializeToBuffer_R(SEXP handle) {
   return ret;
 }
 
-SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
+XGB_DLL SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
   R_API_BEGIN();
   CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle),
                                             RAW(raw),
@@ -493,7 +490,7 @@ SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
   return R_NilValue;
 }
 
-SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
+XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
   SEXP out;
   R_API_BEGIN();
   bst_ulong olen;
@@ -530,7 +527,7 @@ SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_for
   return out;
 }
 
-SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) {
+XGB_DLL SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) {
   SEXP out;
   R_API_BEGIN();
   int success;
@@ -550,7 +547,7 @@ SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) {
   return out;
 }
 
-SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val) {
+XGB_DLL SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val) {
   R_API_BEGIN();
   const char *v = isNull(val) ? nullptr : CHAR(asChar(val));
   CHECK_CALL(XGBoosterSetAttr(R_ExternalPtrAddr(handle),
@@ -559,7 +556,7 @@ SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val) {
   return R_NilValue;
 }
 
-SEXP XGBoosterGetAttrNames_R(SEXP handle) {
+XGB_DLL SEXP XGBoosterGetAttrNames_R(SEXP handle) {
   SEXP out;
   R_API_BEGIN();
   bst_ulong len;
@@ -578,3 +575,51 @@ SEXP XGBoosterGetAttrNames_R(SEXP handle) {
   UNPROTECT(1);
   return out;
 }
+
+XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
+  SEXP out_features_sexp;
+  SEXP out_scores_sexp;
+  SEXP out_shape_sexp;
+  SEXP r_out;
+  R_API_BEGIN();
+  char const *c_json_config = CHAR(asChar(json_config));
+  bst_ulong out_n_features;
+  char const **out_features;
+
+  bst_ulong out_dim;
+  bst_ulong const *out_shape;
+  float const *out_scores;
+
+  CHECK_CALL(XGBoosterFeatureScore(R_ExternalPtrAddr(handle), c_json_config,
+                                   &out_n_features, &out_features,
+                                   &out_dim, &out_shape, &out_scores));
+  out_shape_sexp = PROTECT(allocVector(INTSXP, out_dim));
+  size_t len = 1;
+  for (size_t i = 0; i < out_dim; ++i) {
+    INTEGER(out_shape_sexp)[i] = out_shape[i];
+    len *= out_shape[i];
+  }
+
+  out_scores_sexp = PROTECT(allocVector(REALSXP, len));
+#pragma omp parallel for
+  for (omp_ulong i = 0; i < len; ++i) {
+    REAL(out_scores_sexp)[i] = out_scores[i];
+  }
+
+  out_features_sexp = PROTECT(allocVector(STRSXP, out_n_features));
+  for (size_t i = 0; i < out_n_features; ++i) {
+    SET_STRING_ELT(out_features_sexp, i, mkChar(out_features[i]));
+  }
+
+  r_out = PROTECT(allocVector(VECSXP, 3));
+  SET_VECTOR_ELT(r_out, 0, out_features_sexp);
+  SET_VECTOR_ELT(r_out, 1, out_shape_sexp);
+  SET_VECTOR_ELT(r_out, 2, out_scores_sexp);
+
+  R_API_END();
+  UNPROTECT(4);
+
+  return r_out;
+}

View File

@@ -275,4 +275,12 @@ XGB_DLL SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val);
  */
 XGB_DLL SEXP XGBoosterGetAttrNames_R(SEXP handle);
 
+/*!
+ * \brief Get feature scores from the model.
+ * \param json_config See `XGBoosterFeatureScore` in xgboost c_api.h
+ * \return A vector with the first element as feature names, second element as shape of
+ *         feature scores, and third element as feature scores.
+ */
+XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config);
+
 #endif  // XGBOOST_WRAPPER_R_H_ // NOLINT(*)

View File

@@ -11,8 +11,8 @@ DEMO_DIR = os.path.join(XGBOOST_ROOT_DIR, 'demo')
 
 # simple example
 # load file from text file, also binary buffer generated by xgboost
-dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train'))
-dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test'))
+dtrain = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.train?indexing_mode=1'))
+dtest = xgb.DMatrix(os.path.join(DEMO_DIR, 'data', 'agaricus.txt.test?indexing_mode=1'))
 
 # specify parameters via map, definition are same as c++ version
 param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

View File

@@ -1195,10 +1195,13 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
                                        const char ***out_features);
 
 /*!
- * \brief Calculate feature scores for tree models.
+ * \brief Calculate feature scores for tree models. When used on a linear model, only the
+ *        `weight` importance type is defined, and the output scores are a row-major matrix
+ *        with shape [n_features, n_classes] for multi-class models. For tree models, the
+ *        number of scores always equals `out_n_features`, and multiple importance types are defined.
  *
  * \param handle An instance of Booster
  * \param json_config Parameters for computing scores. Accepted JSON keys are:
  *   - importance_type: A JSON string with following possible values:
  *     * 'weight': the number of times a feature is used to split the data across all trees.
  *     * 'gain': the average gain across all splits the feature is used in.
@@ -1206,15 +1209,20 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
  *     * 'total_gain': the total gain across all splits the feature is used in.
  *     * 'total_cover': the total coverage across all splits the feature is used in.
  *   - feature_map: An optional JSON string with URI or path to the feature map file.
+ *   - feature_names: An optional JSON array with string names for each feature.
  *
- * \param out_length Length of output arrays.
+ * \param out_n_features Length of output feature names.
  * \param out_features An array of string as feature names, ordered the same as output scores.
- * \param out_scores An array of floating point as feature scores.
+ * \param out_dim Dimension of output feature scores.
+ * \param out_shape Shape of output feature scores with length of `out_dim`.
+ * \param out_scores An array of floating point as feature scores with shape of `out_shape`.
  *
  * \return 0 when success, -1 when failure happens
  */
 XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config,
-                                  bst_ulong *out_length,
-                                  const char ***out_features,
-                                  float **out_scores);
+                                  bst_ulong *out_n_features,
+                                  char const ***out_features,
+                                  bst_ulong *out_dim,
+                                  bst_ulong const **out_shape,
+                                  float const **out_scores);
+
 #endif  // XGBOOST_C_API_H_

View File

@@ -184,9 +184,7 @@ class GradientBooster : public Model, public Configurable {
   virtual void FeatureScore(std::string const &importance_type,
                             std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const {
-    LOG(FATAL) << "`feature_score` is not implemented for current booster.";
-  }
+                            std::vector<float> *scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */

View File

@@ -13,6 +13,7 @@
 #include <array>
 #include <algorithm>
 #include <utility>
+#include <vector>
 
 namespace xgboost {
 /*!
@@ -59,6 +60,13 @@ template <typename T> class MatrixView {
     strides_[0] = shape[1];
     strides_[1] = 1;
   }
+  MatrixView(std::vector<T> *vec, std::array<size_t, 2> shape)
+      : device_{GenericParameter::kCpuId}, values_{*vec} {
+    CHECK_EQ(vec->size(), shape[0] * shape[1]);
+    std::copy(shape.cbegin(), shape.cend(), shape_);
+    strides_[0] = shape[1];
+    strides_[1] = 1;
+  }
   MatrixView(HostDeviceVector<std::remove_const_t<T>> const *vec,
              std::array<size_t, 2> shape, int32_t device)
       : device_{device}, values_{InferValues(vec, device)} {
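The new constructor wraps a flat std::vector as a row-major two-dimensional view with strides [shape[1], 1]. In NumPy terms the indexing is equivalent to the following sketch (an analogy for illustration, not code from the commit):

import numpy as np

n_features, n_groups = 4, 3
flat = np.arange(n_features * n_groups, dtype=np.float32)

# Row-major view: element (i, g) lives at flat[i * n_groups + g],
# i.e. element strides of [n_groups, 1], matching MatrixView.
view = flat.reshape(n_features, n_groups)
assert view[2, 1] == flat[2 * n_groups + 1]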

View File

@@ -1,10 +1,10 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2021 by Contributors
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
 
  http://www.apache.org/licenses/LICENSE-2.0
 
  Unless required by applicable law or agreed to in writing, software
@@ -32,6 +32,9 @@ import org.junit.Test;
  * @author hzx
  */
 public class BoosterImplTest {
+  private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1";
+  private String test_uri = "../../demo/data/agaricus.txt.test?indexing_mode=1";
+
   public static class EvalError implements IEvaluation {
     @Override
     public String getMetric() {
@@ -87,8 +90,8 @@ public class BoosterImplTest {
 
   @Test
   public void testBoosterBasic() throws XGBoostError, IOException {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
@@ -103,8 +106,8 @@ public class BoosterImplTest {
 
   @Test
   public void saveLoadModelWithPath() throws XGBoostError, IOException {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     IEvaluation eval = new EvalError();
 
     Booster booster = trainBooster(trainMat, testMat);
@@ -121,8 +124,8 @@ public class BoosterImplTest {
 
   @Test
   public void saveLoadModelWithStream() throws XGBoostError, IOException {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
@@ -310,8 +313,8 @@ public class BoosterImplTest {
 
   @Test
   public void testBoosterEarlyStop() throws XGBoostError, IOException {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -363,8 +366,8 @@ public class BoosterImplTest {
 
   @Test
   public void testQuantileHistoDepthWise() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -383,8 +386,8 @@ public class BoosterImplTest {
 
   @Test
   public void testQuantileHistoLossGuide() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -404,8 +407,8 @@ public class BoosterImplTest {
 
   @Test
   public void testQuantileHistoLossGuideMaxBin() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -425,8 +428,8 @@ public class BoosterImplTest {
 
   @Test
   public void testDumpModelJson() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] dump = booster.getModelDump("", false, "json");
@@ -441,8 +444,8 @@ public class BoosterImplTest {
 
   @Test
   public void testGetFeatureScore() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] featureNames = new String[126];
@@ -453,8 +456,8 @@ public class BoosterImplTest {
 
   @Test
   public void testGetFeatureImportanceGain() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] featureNames = new String[126];
@@ -465,8 +468,8 @@ public class BoosterImplTest {
 
   @Test
   public void testGetFeatureImportanceTotalGain() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] featureNames = new String[126];
@@ -477,8 +480,8 @@ public class BoosterImplTest {
 
   @Test
   public void testGetFeatureImportanceCover() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] featureNames = new String[126];
@@ -489,8 +492,8 @@ public class BoosterImplTest {
 
   @Test
   public void testGetFeatureImportanceTotalCover() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     String[] featureNames = new String[126];
@@ -501,7 +504,7 @@ public class BoosterImplTest {
 
   @Test
   public void testQuantileHistoDepthwiseMaxDepth() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix(this.train_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -519,8 +522,8 @@ public class BoosterImplTest {
 
   @Test
   public void testQuantileHistoDepthwiseMaxDepthMaxBin() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
         put("max_depth", 3);
@@ -545,7 +548,7 @@ public class BoosterImplTest {
 
   @Test
   public void testCV() throws XGBoostError {
     //load train mat
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
+    DMatrix trainMat = new DMatrix(this.train_uri);
 
     //set params
     Map<String, Object> param = new HashMap<String, Object>() {
@@ -573,8 +576,8 @@ public class BoosterImplTest {
    */
   @Test
   public void testTrainFromExistingModel() throws XGBoostError, IOException {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     IEvaluation eval = new EvalError();
 
     Map<String, Object> paramMap = new HashMap<String, Object>() {
@@ -624,8 +627,8 @@ public class BoosterImplTest {
    */
   @Test
   public void testSetAndGetAttrs() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
 
     Booster booster = trainBooster(trainMat, testMat);
     booster.setAttr("testKey1", "testValue1");
@@ -654,10 +657,10 @@ public class BoosterImplTest {
    */
   @Test
   public void testGetNumFeature() throws XGBoostError {
-    DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train");
-    DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test");
+    DMatrix trainMat = new DMatrix(this.train_uri);
+    DMatrix testMat = new DMatrix(this.test_uri);
     Booster booster = trainBooster(trainMat, testMat);
-    TestCase.assertEquals(booster.getNumFeature(), 127);
+    TestCase.assertEquals(booster.getNumFeature(), 126);
   }
 }

View File

@@ -2132,47 +2132,18 @@ class Booster(object):
         fmap = os.fspath(os.path.expanduser(fmap))
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
-        if self.feature_names is not None and fmap == '':
-            flen = len(self.feature_names)
-
-            fname = from_pystr_to_cstr(self.feature_names)
-
-            if self.feature_types is None:
-                # use quantitative as default
-                # {'q': quantitative, 'i': indicator}
-                ftype = from_pystr_to_cstr(['q'] * flen)
-            else:
-                ftype = from_pystr_to_cstr(self.feature_types)
-            _check_call(_LIB.XGBoosterDumpModelExWithFeatures(
-                self.handle,
-                ctypes.c_int(flen),
-                fname,
-                ftype,
-                ctypes.c_int(with_stats),
-                c_str(dump_format),
-                ctypes.byref(length),
-                ctypes.byref(sarr)))
-        else:
-            if fmap != '' and not os.path.exists(fmap):
-                raise ValueError("No such file: {0}".format(fmap))
-            _check_call(_LIB.XGBoosterDumpModelEx(self.handle,
-                                                  c_str(fmap),
-                                                  ctypes.c_int(with_stats),
-                                                  c_str(dump_format),
-                                                  ctypes.byref(length),
-                                                  ctypes.byref(sarr)))
+        _check_call(_LIB.XGBoosterDumpModelEx(self.handle,
+                                              c_str(fmap),
+                                              ctypes.c_int(with_stats),
+                                              c_str(dump_format),
+                                              ctypes.byref(length),
+                                              ctypes.byref(sarr)))
         res = from_cstr_to_pystr(sarr, length)
         return res
 
     def get_fscore(self, fmap=''):
         """Get feature importance of each feature.
 
-        .. note:: Feature importance is defined only for tree boosters
-
-            Feature importance is only defined when the decision tree model is chosen as base
-            learner (`booster=gbtree`). It is not defined for other base learner types, such
-            as linear learners (`booster=gblinear`).
-
         .. note:: Zero-importance features will not be included
 
            Keep in mind that this function does not include zero-importance feature, i.e.
@@ -2190,7 +2161,7 @@ class Booster(object):
         self, fmap: os.PathLike = '', importance_type: str = 'weight'
     ) -> Dict[str, float]:
         """Get feature importance of each feature.
-        Importance type can be defined as:
+        For tree models, importance type can be defined as:
 
         * 'weight': the number of times a feature is used to split the data across all trees.
         * 'gain': the average gain across all splits the feature is used in.
@@ -2198,11 +2169,15 @@ class Booster(object):
         * 'total_gain': the total gain across all splits the feature is used in.
         * 'total_cover': the total coverage across all splits the feature is used in.
 
-        .. note:: Feature importance is defined only for tree boosters
+        .. note::
 
-            Feature importance is only defined when the decision tree model is chosen as
-            base learner (`booster=gbtree` or `booster=dart`). It is not defined for other
-            base learner types, such as linear learners (`booster=gblinear`).
+           For linear models, only "weight" is defined: the normalized coefficients
+           without bias.
+
+        .. note:: Zero-importance features will not be included
+
+           Keep in mind that this function does not include zero-importance feature, i.e.
+           those features that have not been used in any split conditions.
 
         Parameters
         ----------
@@ -2213,7 +2188,9 @@ class Booster(object):
 
         Returns
         -------
-        A map between feature names and their scores.
+        A map between feature names and their scores. When `gblinear` is used for
+        multi-class classification the scores for each feature are a list with length
+        `n_classes`, otherwise they're scalars.
         """
         fmap = os.fspath(os.path.expanduser(fmap))
         args = from_pystr_to_cstr(
@@ -2221,21 +2198,31 @@ class Booster(object):
         )
         features = ctypes.POINTER(ctypes.c_char_p)()
         scores = ctypes.POINTER(ctypes.c_float)()
-        length = c_bst_ulong()
+        n_out_features = c_bst_ulong()
+        out_dim = c_bst_ulong()
+        shape = ctypes.POINTER(c_bst_ulong)()
         _check_call(
             _LIB.XGBoosterFeatureScore(
                 self.handle,
                 args,
-                ctypes.byref(length),
+                ctypes.byref(n_out_features),
                 ctypes.byref(features),
-                ctypes.byref(scores)
+                ctypes.byref(out_dim),
+                ctypes.byref(shape),
+                ctypes.byref(scores),
            )
         )
-        features_arr = from_cstr_to_pystr(features, length)
-        scores_arr = ctypes2numpy(scores, length.value, np.float32)
+        features_arr = from_cstr_to_pystr(features, n_out_features)
+        scores_arr = _prediction_output(shape, out_dim, scores, False)
 
         results = {}
-        for feat, score in zip(features_arr, scores_arr):
-            results[feat] = float(score)
+        if len(scores_arr.shape) > 1 and scores_arr.shape[1] > 1:
+            for feat, score in zip(features_arr, scores_arr):
+                results[feat] = [float(s) for s in score]
+        else:
+            for feat, score in zip(features_arr, scores_arr):
+                results[feat] = float(score)
         return results
 
     def trees_to_dataframe(self, fmap=''):

View File

@@ -156,9 +156,14 @@ __model_doc = f'''
         [2, 3, 4]], where each inner list is a group of indices of features
         that are allowed to interact with each other. See tutorial for more
         information
-    importance_type: string, default "gain"
-        The feature importance type for the feature_importances\\_ property:
-        either "gain", "weight", "cover", "total_gain" or "total_cover".
+    importance_type: Optional[str]
+        The feature importance type for the feature_importances\\_ property:
+
+        * For tree models, it's either "gain", "weight", "cover", "total_gain" or
+          "total_cover".
+        * For linear models, only "weight" is defined: the normalized coefficients
+          without bias.
+
     gpu_id : Optional[int]
         Device ordinal.
     validate_parameters : Optional[bool]
@@ -382,7 +387,7 @@ class XGBModel(XGBModelBase):
         num_parallel_tree: Optional[int] = None,
         monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
         interaction_constraints: Optional[Union[str, List[Tuple[str]]]] = None,
-        importance_type: str = "gain",
+        importance_type: Optional[str] = None,
         gpu_id: Optional[int] = None,
         validate_parameters: Optional[bool] = None,
         predictor: Optional[str] = None,
@@ -991,29 +996,26 @@ class XGBModel(XGBModelBase):
     @property
     def feature_importances_(self) -> np.ndarray:
         """
-        Feature importances property
-
-        .. note:: Feature importance is defined only for tree boosters
-
-            Feature importance is only defined when the decision tree model is chosen as base
-            learner (`booster=gbtree`). It is not defined for other base learner types, such
-            as linear learners (`booster=gblinear`).
+        Feature importances property; the return value depends on the
+        `importance_type` parameter.
 
         Returns
        -------
-        feature_importances_ : array of shape ``[n_features]``
-
+        feature_importances_ : array of shape ``[n_features]``, except for a
+            multi-class linear model, which returns an array with shape
+            `(n_features, n_classes)`
         """
-        if self.get_params()['booster'] not in {'gbtree', 'dart'}:
-            raise AttributeError(
-                'Feature importance is not defined for Booster type {}'
-                .format(self.booster))
         b: Booster = self.get_booster()
-        score = b.get_score(importance_type=self.importance_type)
+
+        def dft() -> str:
+            return "weight" if self.booster == "gblinear" else "gain"
+
+        score = b.get_score(
+            importance_type=self.importance_type if self.importance_type else dft()
+        )
         if b.feature_names is None:
             feature_names = ["f{0}".format(i) for i in range(self.n_features_in_)]
         else:
             feature_names = b.feature_names
+        # gblinear returns all features so the `get` in next line is only for gbtree.
         all_features = [score.get(f, 0.) for f in feature_names]
         all_features_arr = np.array(all_features, dtype=np.float32)
         total = all_features_arr.sum()
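A short sketch of the resulting sklearn-interface behavior, assuming this commit is applied (it mirrors the test added at the bottom of this diff; the dataset choice is illustrative):

import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits

X, y = load_digits(n_class=3, return_X_y=True)

# importance_type now defaults to None; the property falls back to
# "weight" for gblinear and "gain" for tree boosters.
cls = xgb.XGBClassifier(booster="gblinear", n_estimators=4)
cls.fit(X, y)

# For a multi-class linear model the property has shape
# (n_features, n_classes), normalized by the sum of coefficients.
imp = cls.feature_importances_
assert imp.shape == (X.shape[1], 3)
np.testing.assert_allclose(imp.sum(), 1.0, rtol=1e-5)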

View File

@@ -927,14 +927,17 @@ XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer,
   API_END();
 }
 
-inline void XGBoostDumpModelImpl(BoosterHandle handle, const FeatureMap &fmap,
+inline void XGBoostDumpModelImpl(BoosterHandle handle, FeatureMap* fmap,
                                  int with_stats, const char *format,
                                  xgboost::bst_ulong *len,
                                  const char ***out_models) {
   auto *bst = static_cast<Learner*>(handle);
+  bst->Configure();
+  GenerateFeatureMap(bst, {}, bst->GetNumFeature(), fmap);
+
   std::vector<std::string>& str_vecs = bst->GetThreadLocal().ret_vec_str;
   std::vector<const char*>& charp_vecs = bst->GetThreadLocal().ret_vec_charp;
-  str_vecs = bst->DumpModel(fmap, with_stats != 0, format);
+  str_vecs = bst->DumpModel(*fmap, with_stats != 0, format);
   charp_vecs.resize(str_vecs.size());
   for (size_t i = 0; i < str_vecs.size(); ++i) {
     charp_vecs[i] = str_vecs[i].c_str();
@@ -962,14 +965,9 @@ XGB_DLL int XGBoosterDumpModelEx(BoosterHandle handle,
                                  const char*** out_models) {
   API_BEGIN();
   CHECK_HANDLE();
-  FeatureMap featmap;
-  if (strlen(fmap) != 0) {
-    std::unique_ptr<dmlc::Stream> fs(
-        dmlc::Stream::Create(fmap, "r"));
-    dmlc::istream is(fs.get());
-    featmap.LoadText(is);
-  }
-  XGBoostDumpModelImpl(handle, featmap, with_stats, format, len, out_models);
+  std::string uri{fmap};
+  FeatureMap featmap = LoadFeatureMap(uri);
+  XGBoostDumpModelImpl(handle, &featmap, with_stats, format, len, out_models);
   API_END();
 }
 
@@ -980,8 +978,8 @@ XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
                                            int with_stats,
                                            xgboost::bst_ulong* len,
                                            const char*** out_models) {
-  return XGBoosterDumpModelExWithFeatures(handle, fnum, fname, ftype, with_stats,
-                                          "text", len, out_models);
+  return XGBoosterDumpModelExWithFeatures(handle, fnum, fname, ftype,
                                          with_stats, "text", len, out_models);
 }
 
 XGB_DLL int XGBoosterDumpModelExWithFeatures(BoosterHandle handle,
@@ -998,7 +996,7 @@ XGB_DLL int XGBoosterDumpModelExWithFeatures(BoosterHandle handle,
   for (int i = 0; i < fnum; ++i) {
     featmap.PushBack(i, fname[i], ftype[i]);
   }
-  XGBoostDumpModelImpl(handle, featmap, with_stats, format, len, out_models);
+  XGBoostDumpModelImpl(handle, &featmap, with_stats, format, len, out_models);
   API_END();
 }
 
@@ -1098,11 +1096,12 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
   API_END();
 }
 
-XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle,
-                                  const char *json_config,
-                                  xgboost::bst_ulong* out_length,
-                                  const char ***out_features,
-                                  float **out_scores) {
+XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
+                                  xgboost::bst_ulong *out_n_features,
+                                  char const ***out_features,
+                                  bst_ulong *out_dim,
+                                  bst_ulong const **out_shape,
+                                  float const **out_scores) {
   API_BEGIN();
   CHECK_HANDLE();
   auto *learner = static_cast<Learner *>(handle);
@@ -1113,14 +1112,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle,
     feature_map_uri = get<String const>(config["feature_map"]);
   }
   FeatureMap feature_map = LoadFeatureMap(feature_map_uri);
+  std::vector<Json> custom_feature_names;
+  if (!IsA<Null>(config["feature_names"])) {
+    custom_feature_names = get<Array const>(config["feature_names"]);
+  }
 
   auto& scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
   learner->CalcFeatureScore(importance, &features, &scores);
 
   auto n_features = learner->GetNumFeature();
-  GenerateFeatureMap(learner, n_features, &feature_map);
+  GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
+  CHECK_LE(features.size(), n_features);
 
   auto& feature_names = learner->GetThreadLocal().ret_vec_str;
   feature_names.resize(features.size());
@@ -1131,10 +1133,24 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle,
     feature_names[i] = feature_map.Name(features[i]);
     feature_names_c[i] = feature_names[i].data();
   }
+  *out_n_features = feature_names.size();
 
-  CHECK_EQ(scores.size(), features.size());
-  CHECK_EQ(scores.size(), feature_names.size());
-  *out_length = scores.size();
+  CHECK_LE(features.size(), scores.size());
+  auto &shape = learner->GetThreadLocal().prediction_shape;
+  if (scores.size() > features.size()) {
+    // Multi-class linear model
+    CHECK_EQ(scores.size() % features.size(), 0ul);
+    auto n_classes = scores.size() / features.size();
+    *out_dim = 2;
+    shape = {n_features, n_classes};
+  } else {
+    CHECK_EQ(features.size(), scores.size());
+    *out_dim = 1;
+    shape.resize(1);
+    shape.front() = scores.size();
+  }
 
+  *out_shape = dmlc::BeginPtr(shape);
   *out_scores = scores.data();
   *out_features = dmlc::BeginPtr(feature_names_c);
   API_END();

View File

@@ -194,8 +194,8 @@ inline FeatureMap LoadFeatureMap(std::string const& uri) {
   return feat;
 }
 
-// FIXME(jiamingy): Use this for model dump.
 inline void GenerateFeatureMap(Learner const *learner,
+                               std::vector<Json> const &custom_feature_names,
                                size_t n_features, FeatureMap *out_feature_map) {
   auto &feature_map = *out_feature_map;
   auto maybe = [&](std::vector<std::string> const &values, size_t i,
@@ -205,15 +205,31 @@ inline void GenerateFeatureMap(Learner const *learner,
   if (feature_map.Size() == 0) {
     // Use the feature names and types from booster.
     std::vector<std::string> feature_names;
-    learner->GetFeatureNames(&feature_names);
+    // priority:
+    // 1. feature map.
+    // 2. customized feature name.
+    // 3. from booster.
+    // 4. default feature name.
+    if (!custom_feature_names.empty()) {
+      CHECK_EQ(custom_feature_names.size(), n_features)
+          << "Incorrect number of feature names.";
+      feature_names.resize(custom_feature_names.size());
+      std::transform(custom_feature_names.begin(), custom_feature_names.end(),
+                     feature_names.begin(),
+                     [](Json const &name) { return get<String const>(name); });
+    } else {
+      learner->GetFeatureNames(&feature_names);
+    }
     if (!feature_names.empty()) {
       CHECK_EQ(feature_names.size(), n_features) << "Incorrect number of feature names.";
     }
+
     std::vector<std::string> feature_types;
     learner->GetFeatureTypes(&feature_types);
     if (!feature_types.empty()) {
       CHECK_EQ(feature_types.size(), n_features) << "Incorrect number of feature types.";
     }
+
     for (size_t i = 0; i < n_features; ++i) {
       feature_map.PushBack(
           i,

View File

@@ -12,6 +12,7 @@
 #include <string>
 #include <sstream>
 #include <algorithm>
+#include <numeric>
 
 #include "xgboost/gbm.h"
 #include "xgboost/json.h"
@@ -19,6 +20,7 @@
 #include "xgboost/linear_updater.h"
 #include "xgboost/logging.h"
 #include "xgboost/learner.h"
+#include "xgboost/linalg.h"
 
 #include "gblinear_model.h"
 #include "../common/timer.h"
@@ -219,6 +221,26 @@ class GBLinear : public GradientBooster {
     return model_.DumpModel(fmap, with_stats, format);
   }
 
+  void FeatureScore(std::string const &importance_type,
+                    std::vector<bst_feature_t> *out_features,
+                    std::vector<float> *out_scores) const override {
+    CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK_EQ(importance_type, "weight")
+        << "gblinear only has `weight` defined for feature importance.";
+    out_features->resize(this->learner_model_param_->num_feature, 0);
+    std::iota(out_features->begin(), out_features->end(), 0);
+    // Don't include the bias term in the feature importance scores.
+    // The bias is the last weight.
+    out_scores->resize(model_.weight.size() - learner_model_param_->num_output_group, 0);
+    auto n_groups = learner_model_param_->num_output_group;
+    MatrixView<float> scores{out_scores, {learner_model_param_->num_feature, n_groups}};
+    for (size_t i = 0; i < learner_model_param_->num_feature; ++i) {
+      for (bst_group_t g = 0; g < n_groups; ++g) {
+        scores(i, g) = model_[i][g];
+      }
+    }
+  }
+
   bool UseGPU() const override {
     if (param_.updater == "gpu_coord_descent") {
       return true;
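The effect of GBLinear::FeatureScore on the weight buffer, sketched in NumPy (illustrative names; gblinear stores one row of coefficients per feature plus a final bias row, which is what the `model_.weight.size() - num_output_group` resize drops):

import numpy as np

n_features, n_groups = 4, 3
# Flat gblinear weight vector: (n_features + 1) * n_groups entries,
# with the bias occupying the last n_groups slots.
weight = np.arange((n_features + 1) * n_groups, dtype=np.float32)

# Drop the trailing bias group and view the rest as [n_features, n_groups],
# mirroring the MatrixView loop in FeatureScore.
scores = weight[: n_features * n_groups].reshape(n_features, n_groups)
assert scores.shape == (n_features, n_groups)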

View File

@@ -325,16 +325,19 @@ class GBTree : public GradientBooster {
       add_score([&](auto const &p_tree, bst_node_t, bst_feature_t split) {
         gain_map[split] = split_counts[split];
       });
-    }
-    if (importance_type == "gain" || importance_type == "total_gain") {
+    } else if (importance_type == "gain" || importance_type == "total_gain") {
       add_score([&](auto const &p_tree, bst_node_t nidx, bst_feature_t split) {
         gain_map[split] += p_tree->Stat(nidx).loss_chg;
       });
-    }
-    if (importance_type == "cover" || importance_type == "total_cover") {
+    } else if (importance_type == "cover" || importance_type == "total_cover") {
       add_score([&](auto const &p_tree, bst_node_t nidx, bst_feature_t split) {
         gain_map[split] += p_tree->Stat(nidx).sum_hess;
       });
+    } else {
+      LOG(FATAL)
+          << "Unknown feature importance type, expected one of: "
+          << R"({"weight", "total_gain", "total_cover", "gain", "cover"}, got: )"
+          << importance_type;
     }
+
     if (importance_type == "gain" || importance_type == "cover") {
       for (size_t i = 0; i < gain_map.size(); ++i) {

View File

@@ -1197,23 +1197,6 @@ class LearnerImpl : public LearnerIO {
                         std::vector<bst_feature_t> *features,
                         std::vector<float> *scores) override {
     this->Configure();
-    std::vector<std::string> allowed_importance_type = {
-      "weight", "total_gain", "total_cover", "gain", "cover"
-    };
-    if (std::find(allowed_importance_type.begin(),
-                  allowed_importance_type.end(),
-                  importance_type) == allowed_importance_type.end()) {
-      std::stringstream ss;
-      ss << "importance_type mismatch, got: `" << importance_type
-         << "`, expected one of ";
-      for (size_t i = 0; i < allowed_importance_type.size(); ++i) {
-        ss << "`" << allowed_importance_type[i] << "`";
-        if (i != allowed_importance_type.size() - 1) {
-          ss << ", ";
-        }
-      }
-      LOG(FATAL) << ss.str();
-    }
     gbm_->FeatureScore(importance_type, features, scores);
   }

View File

@@ -154,6 +154,9 @@ class TestBasic:
         dump4j = json.loads(dump4[0])
         assert 'gain' in dump4j, "Expected 'gain' to be dumped in JSON."
 
+        with pytest.raises(ValueError):
+            bst.get_dump(fmap="foo")
+
     def test_feature_score(self):
         rng = np.random.RandomState(0)
         data = rng.randn(100, 2)

View File

@@ -211,6 +211,7 @@ def test_feature_importances_weight():
     digits = load_digits(n_class=2)
     y = digits['target']
     X = digits['data']
+
     xgb_model = xgb.XGBClassifier(random_state=0,
                                   tree_method="exact",
                                   learning_rate=0.1,
@@ -241,6 +242,33 @@ def test_feature_importances_weight():
                               importance_type="weight").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
+    with pytest.raises(ValueError):
+        xgb_model.set_params(importance_type="foo")
+        xgb_model.feature_importances_
+
+    X, y = load_digits(n_class=3, return_X_y=True)
+
+    cls = xgb.XGBClassifier(booster="gblinear", n_estimators=4)
+    cls.fit(X, y)
+    assert cls.feature_importances_.shape[0] == X.shape[1]
+    assert cls.feature_importances_.shape[1] == 3
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "model.json")
+        cls.save_model(path)
+        with open(path, "r") as fd:
+            model = json.load(fd)
+    weights = np.array(
+        model["learner"]["gradient_booster"]["model"]["weights"]
+    ).reshape((cls.n_features_in_ + 1, 3))
+    weights = weights[:-1, ...]
+    np.testing.assert_allclose(
+        weights / weights.sum(), cls.feature_importances_, rtol=1e-6
+    )
+
+    with pytest.raises(ValueError):
+        cls.set_params(importance_type="cover")
+        cls.feature_importances_
+
 
 @pytest.mark.skipif(**tm.no_pandas())
 def test_feature_importances_gain():