adding feature contributions to R and gblinear (#2295)

* [gblinear] add feature contribution prediction; fix DumpModel bug

* [gbtree] minor changes to PredContrib

* [R] add feature contribution prediction to R

* [R] bump up version; update NEWS

* [gblinear] fix the base_margin issue; fixes #1969

* [R] list of matrices as output of multiclass feature contributions

* [gblinear] make order of DumpModel coefficients consistent: group index changes the fastest
This commit is contained in:
Vadim Khotilovich
2017-05-21 06:41:51 -05:00
committed by Yuan (Terry) Tang
parent e5e721722e
commit b52db87d5c
10 changed files with 255 additions and 60 deletions

View File

@@ -180,10 +180,6 @@ class GBLinear : public GradientBooster {
<< "GBLinear::Predict ntrees is only valid for gbtree predictor";
std::vector<bst_float> &preds = *out_preds;
const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
if (base_margin.size() != 0) {
CHECK_EQ(preds.size(), base_margin.size())
<< "base_margin.size does not match with prediction size";
}
preds.resize(0);
// start collecting the prediction
dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
@@ -218,45 +214,87 @@ class GBLinear : public GradientBooster {
this->Pred(inst, dmlc::BeginPtr(*out_preds), gid, base_margin_);
}
}
// Leaf-index prediction is a tree-model concept; a linear booster has no
// leaves, so this always aborts.  (The duplicated old log line left over
// from the diff rendering has been dropped; only the post-commit message
// is kept.)
void PredictLeaf(DMatrix *p_fmat,
                 std::vector<bst_float> *out_preds,
                 unsigned ntree_limit) override {
  LOG(FATAL) << "gblinear does not support prediction of leaf index";
}
/*!
 * \brief Compute per-feature contributions for every row and output group.
 *        For a linear model the contribution of feature j in group g is
 *        simply x_j * w[j][g]; the last column holds the BIAS term
 *        (group bias plus base margin).
 * \param p_fmat input data matrix
 * \param out_contribs resized to num_row * (num_feature + 1) * num_output_group;
 *        layout is [row][group][feature..., bias]
 * \param ntree_limit must be 0 — tree limits only make sense for gbtree
 */
void PredictContribution(DMatrix* p_fmat,
                         std::vector<bst_float>* out_contribs,
                         unsigned ntree_limit) override {
  // lazily initialize the weights in case the model was never trained/loaded
  if (model.weight.size() == 0) {
    model.InitModel();
  }
  CHECK_EQ(ntree_limit, 0U)
      << "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
  const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
  const int ngroup = model.param.num_output_group;
  const size_t ncolumns = model.param.num_feature + 1;
  // allocate space for (#features + bias) times #groups times #rows
  std::vector<bst_float>& contribs = *out_contribs;
  contribs.resize(p_fmat->info().num_row * ncolumns * ngroup);
  // make sure contributions is zeroed, we could be reusing a previously
  // allocated one; entries for features absent from a sparse row stay 0
  std::fill(contribs.begin(), contribs.end(), 0);
  // start collecting the contributions
  dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
  iter->BeforeFirst();
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
    // parallel over local batch; rows are independent, each thread writes
    // to a disjoint slice of contribs
    const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
    #pragma omp parallel for schedule(static)
    for (bst_omp_uint i = 0; i < nsize; ++i) {
      const RowBatch::Inst &inst = batch[i];
      size_t row_idx = static_cast<size_t>(batch.base_rowid + i);
      // loop over output groups
      for (int gid = 0; gid < ngroup; ++gid) {
        bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
        // calculate linear terms' contributions
        for (bst_uint c = 0; c < inst.length; ++c) {
          // skip out-of-range column indices (can appear in external data)
          if (inst[c].index >= model.param.num_feature) continue;
          p_contribs[inst[c].index] = inst[c].fvalue * model[inst[c].index][gid];
        }
        // add base margin to BIAS: per-row margin when supplied, otherwise
        // the global base_margin_ (fixes #1969)
        p_contribs[ncolumns - 1] = model.bias()[gid] +
            ((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid]
                                       : base_margin_);
      }
    }
  }
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
const int ngroup = model.param.num_output_group;
const unsigned nfeature = model.param.num_feature;
std::stringstream fo("");
if (format == "json") {
fo << " { \"bias\": [" << std::endl;
for (int i = 0; i < model.param.num_output_group; ++i) {
if (i != 0) fo << "," << std::endl;
fo << " " << model.bias()[i];
for (int gid = 0; gid < ngroup; ++gid) {
if (gid != 0) fo << "," << std::endl;
fo << " " << model.bias()[gid];
}
fo << std::endl << " ]," << std::endl
<< " \"weight\": [" << std::endl;
for (int i = 0; i < model.param.num_output_group; ++i) {
for (unsigned j = 0; j < model.param.num_feature; ++j) {
if (i != 0 || j != 0) fo << "," << std::endl;
fo << " " << model[i][j];
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
if (i != 0 || gid != 0) fo << "," << std::endl;
fo << " " << model[i][gid];
}
}
fo << std::endl << " ]" << std::endl << " }";
} else {
fo << "bias:\n";
for (int i = 0; i < model.param.num_output_group; ++i) {
fo << model.bias()[i] << std::endl;
for (int gid = 0; gid < ngroup; ++gid) {
fo << model.bias()[gid] << std::endl;
}
fo << "weight:\n";
for (int i = 0; i < model.param.num_output_group; ++i) {
for (unsigned j = 0; j <model.param.num_feature; ++j) {
fo << model[i][j] << std::endl;
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
fo << model[i][gid] << std::endl;
}
}
}

View File

@@ -571,6 +571,7 @@ class GBTree : public GradientBooster {
if (ntree_limit == 0 || ntree_limit > trees.size()) {
ntree_limit = static_cast<unsigned>(trees.size());
}
const int ngroup = mparam.num_output_group;
size_t ncolumns = mparam.num_feature + 1;
// allocate space for (number of features + bias) times the number of rows
std::vector<bst_float>& contribs = *out_contribs;
@@ -584,7 +585,7 @@ class GBTree : public GradientBooster {
}
// start collecting the contributions
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
const std::vector<bst_float>& base_margin = p_fmat->info().base_margin;
const std::vector<bst_float>& base_margin = info.base_margin;
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch& batch = iter->Value();
@@ -596,8 +597,8 @@ class GBTree : public GradientBooster {
unsigned root_id = info.GetRoot(row_idx);
RegTree::FVec &feats = thread_temp[omp_get_thread_num()];
// loop over all classes
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
bst_float *p_contribs = &contribs[(row_idx * mparam.num_output_group + gid) * ncolumns];
for (int gid = 0; gid < ngroup; ++gid) {
bst_float *p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
feats.Fill(batch[i]);
// calculate contributions
for (unsigned j = 0; j < ntree_limit; ++j) {
@@ -607,9 +608,9 @@ class GBTree : public GradientBooster {
trees[j]->CalculateContributions(feats, root_id, p_contribs);
}
feats.Drop(batch[i]);
// add base margin to BIAS feature
// add base margin to BIAS
if (base_margin.size() != 0) {
p_contribs[ncolumns - 1] += base_margin[row_idx * mparam.num_output_group + gid];
p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
} else {
p_contribs[ncolumns - 1] += base_margin_;
}