SHAP values for feature contributions (#2438)

* SHAP values for feature contributions
* Fix commenting error
* New polynomial time SHAP value estimation algorithm
* Update API to support SHAP values
* Fix merge conflicts with updates in master
* Correct submodule hashes
* Fix variable sized stack allocation
* Make lint happy
* Add docs
* Fix typo
* Adjust tolerances
* Remove unneeded def
* Fixed cpp test setup
* Updated R API and cleaned up
* Fixed test typo
2017-10-12 12:35:51 -07:00
parent ff9180cd73
commit 78c4188cec
16 changed files with 369 additions and 143 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -758,7 +758,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
      (option_mask & 1) != 0,
      &preds, ntree_limit,
      (option_mask & 2) != 0,
-      (option_mask & 4) != 0);
+      (option_mask & 4) != 0,
+      (option_mask & 8) != 0);
  *out_result = dmlc::BeginPtr(preds);
  *len = static_cast<xgboost::bst_ulong>(preds.size());
  API_END();
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -224,7 +224,7 @@ class GBLinear : public GradientBooster {

  void PredictContribution(DMatrix* p_fmat,
                           std::vector<bst_float>* out_contribs,
-                           unsigned ntree_limit) override {
+                           unsigned ntree_limit, bool approximate) override {
    if (model.weight.size() == 0) {
      model.InitModel();
    }
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -233,8 +233,8 @@ class GBTree : public GradientBooster {

  void PredictContribution(DMatrix* p_fmat,
                           std::vector<bst_float>* out_contribs,
-                           unsigned ntree_limit) override {
-    predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit);
+                           unsigned ntree_limit, bool approximate) override {
+    predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit, approximate);
  }

  std::vector<std::string> DumpModel(const FeatureMap& fmap,
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -433,9 +433,9 @@ class LearnerImpl : public Learner {

  void Predict(DMatrix* data, bool output_margin,
               std::vector<bst_float>* out_preds, unsigned ntree_limit,
-               bool pred_leaf, bool pred_contribs) const override {
+               bool pred_leaf, bool pred_contribs, bool approx_contribs) const override {
    if (pred_contribs) {
-      gbm_->PredictContribution(data, out_preds, ntree_limit);
+      gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
    } else if (pred_leaf) {
      gbm_->PredictLeaf(data, out_preds, ntree_limit);
    } else {
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -206,9 +206,9 @@ class CPUPredictor : public Predictor {
    }
  }

-  void PredictContribution(DMatrix* p_fmat,
-                           std::vector<bst_float>* out_contribs,
-                           const gbm::GBTreeModel& model, unsigned ntree_limit) override {
+  void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
+                           const gbm::GBTreeModel& model, unsigned ntree_limit,
+                           bool approximate) override {
    const int nthread = omp_get_max_threads();
    InitThreadTemp(nthread,  model.param.num_feature);
    const MetaInfo& info = p_fmat->info();
@@ -225,10 +225,12 @@ class CPUPredictor : public Predictor {
    // make sure contributions is zeroed, we could be reusing a previously
    // allocated one
    std::fill(contribs.begin(), contribs.end(), 0);
-// initialize tree node mean values
-#pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
-      model.trees[i]->FillNodeMeanValues();
+    if (approximate) {
+      // initialize tree node mean values
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
+        model.trees[i]->FillNodeMeanValues();
+      }
    }
    // start collecting the contributions
    dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
@@ -253,7 +255,11 @@ class CPUPredictor : public Predictor {
            if (model.tree_info[j] != gid) {
              continue;
            }
-            model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
+            if (!approximate) {
+              model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
+            } else {
+              model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs);
+            }
          }
          feats.Drop(batch[i]);
          // add base margin to BIAS
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -384,9 +384,10 @@ class GPUPredictor : public xgboost::Predictor {
  void PredictContribution(DMatrix* p_fmat,
                           std::vector<bst_float>* out_contribs,
                           const gbm::GBTreeModel& model,
-                           unsigned ntree_limit) override {
+                           unsigned ntree_limit,
+                           bool approximate) override {
    cpu_predictor->PredictContribution(p_fmat, out_contribs, model,
-                                       ntree_limit);
+                                       ntree_limit, approximate);
  }

  void Init(const std::vector<std::pair<std::string, std::string>>& cfg,