Add SHAP interaction effects, fix minor bug, and add cox loss (#3043)
* Add interaction effects and cox loss * Minimize whitespace changes * Cox loss now no longer needs a pre-sorted dataset. * Address code review comments * Remove mem check, rename to pred_interactions, include bias * Make lint happy * More lint fixes * Fix cox loss indexing * Fix main effects and tests * Fix lint * Use half interaction values on the off-diagonals * Fix lint again
This commit is contained in:
committed by
Vadim Khotilovich
parent
077abb35cd
commit
d878c36c84
@@ -759,7 +759,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
&preds, ntree_limit,
|
||||
(option_mask & 2) != 0,
|
||||
(option_mask & 4) != 0,
|
||||
(option_mask & 8) != 0);
|
||||
(option_mask & 8) != 0,
|
||||
(option_mask & 16) != 0);
|
||||
*out_result = dmlc::BeginPtr(preds);
|
||||
*len = static_cast<xgboost::bst_ulong>(preds.size());
|
||||
API_END();
|
||||
|
||||
@@ -224,7 +224,8 @@ class GBLinear : public GradientBooster {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
unsigned ntree_limit, bool approximate, int condition = 0,
|
||||
unsigned condition_feature = 0) override {
|
||||
if (model.weight.size() == 0) {
|
||||
model.InitModel();
|
||||
}
|
||||
@@ -265,6 +266,17 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
}
|
||||
|
||||
// Compute SHAP interaction values for a linear model.
//
// Linear models are purely additive, so every pairwise interaction term is
// zero: the buffer only needs to be sized and zero-filled.
//
// BUG FIX: the output layout must match the tree predictor and the
// (nrow, nfeat + 1, nfeat + 1) reshape performed by callers — one
// (num_feature + 1) x (num_feature + 1) matrix per row per output group,
// where the extra slot holds the bias term.  The previous allocation used
// num_feature^2 and undersized the buffer.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     unsigned ntree_limit, bool approximate) override {
  std::vector<bst_float>& contribs = *out_contribs;

  // linear models have no interaction effects; include the bias column
  const size_t ncolumns = model.param.num_feature + 1;
  const size_t nelements = ncolumns * ncolumns;
  contribs.resize(p_fmat->info().num_row * nelements * model.param.num_output_group);
  std::fill(contribs.begin(), contribs.end(), 0);
}
|
||||
|
||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||
bool with_stats,
|
||||
std::string format) const override {
|
||||
|
||||
@@ -220,10 +220,18 @@ class GBTree : public GradientBooster {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
unsigned ntree_limit, bool approximate, int condition,
|
||||
unsigned condition_feature) override {
|
||||
predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit, approximate);
|
||||
}
|
||||
|
||||
// SHAP interaction values: forwarded verbatim to the underlying predictor,
// which owns the per-backend implementation.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     unsigned ntree_limit,
                                     bool approximate) override {
  predictor->PredictInteractionContributions(
      p_fmat, out_contribs, model_, ntree_limit, approximate);
}
|
||||
|
||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||
bool with_stats,
|
||||
std::string format) const override {
|
||||
|
||||
@@ -443,9 +443,12 @@ class LearnerImpl : public Learner {
|
||||
|
||||
void Predict(DMatrix* data, bool output_margin,
|
||||
std::vector<bst_float>* out_preds, unsigned ntree_limit,
|
||||
bool pred_leaf, bool pred_contribs, bool approx_contribs) const override {
|
||||
bool pred_leaf, bool pred_contribs, bool approx_contribs,
|
||||
bool pred_interactions) const override {
|
||||
if (pred_contribs) {
|
||||
gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
|
||||
} else if (pred_interactions) {
|
||||
gbm_->PredictInteractionContributions(data, out_preds, ntree_limit, approx_contribs);
|
||||
} else if (pred_leaf) {
|
||||
gbm_->PredictLeaf(data, out_preds, ntree_limit);
|
||||
} else {
|
||||
|
||||
@@ -304,6 +304,52 @@ struct EvalMAP : public EvalRankList {
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
|
||||
struct EvalCox : public Metric {
|
||||
public:
|
||||
EvalCox() {}
|
||||
bst_float Eval(const std::vector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
bool distributed) const override {
|
||||
CHECK(!distributed) << "Cox metric does not support distributed evaluation";
|
||||
using namespace std; // NOLINT(*)
|
||||
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
|
||||
const std::vector<size_t> &label_order = info.LabelAbsSort();
|
||||
|
||||
// pre-compute a sum for the denominator
|
||||
double exp_p_sum = 0; // we use double because we might need the precision with large datasets
|
||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||
exp_p_sum += preds[i];
|
||||
}
|
||||
|
||||
double out = 0;
|
||||
double accumulated_sum = 0;
|
||||
bst_omp_uint num_events = 0;
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
const size_t ind = label_order[i];
|
||||
const auto label = info.labels[ind];
|
||||
if (label > 0) {
|
||||
out -= log(preds[ind]) - log(exp_p_sum);
|
||||
++num_events;
|
||||
}
|
||||
|
||||
// only update the denominator after we move forward in time (labels are sorted)
|
||||
accumulated_sum += preds[ind];
|
||||
if (i == ndata - 1 || std::abs(label) < std::abs(info.labels[label_order[i + 1]])) {
|
||||
exp_p_sum -= accumulated_sum;
|
||||
accumulated_sum = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return out/num_events; // normalize by the number of events
|
||||
}
|
||||
|
||||
const char* Name() const override {
|
||||
return "cox-nloglik";
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_METRIC(AMS, "ams")
|
||||
.describe("AMS metric for higgs.")
|
||||
.set_body([](const char* param) { return new EvalAMS(param); });
|
||||
@@ -323,5 +369,9 @@ XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
|
||||
XGBOOST_REGISTER_METRIC(MAP, "map")
|
||||
.describe("map@k for rank.")
|
||||
.set_body([](const char* param) { return new EvalMAP(param); });
|
||||
|
||||
// Register the Cox partial-likelihood metric under its canonical name.
// (Fixes the "proportioanl" typo in the user-facing description.)
XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
.describe("Negative log partial likelihood of Cox proportional hazards model.")
.set_body([](const char* param) { return new EvalCox(); });
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -197,6 +197,90 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
|
||||
.describe("Poisson regression for count data.")
|
||||
.set_body([]() { return new PoissonRegression(); });
|
||||
|
||||
// cox regression for survival data (negative values mean they are censored)
|
||||
class CoxRegression : public ObjFunction {
|
||||
public:
|
||||
// declare functions
|
||||
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
|
||||
void GetGradient(const std::vector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) override {
|
||||
CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
|
||||
CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
|
||||
out_gpair->resize(preds.size());
|
||||
const std::vector<size_t> &label_order = info.LabelAbsSort();
|
||||
|
||||
const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
|
||||
|
||||
// pre-compute a sum
|
||||
double exp_p_sum = 0; // we use double because we might need the precision with large datasets
|
||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||
exp_p_sum += std::exp(preds[label_order[i]]);
|
||||
}
|
||||
|
||||
// start calculating grad and hess
|
||||
double r_k = 0;
|
||||
double s_k = 0;
|
||||
double last_exp_p = 0.0;
|
||||
double last_abs_y = 0.0;
|
||||
double accumulated_sum = 0;
|
||||
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
|
||||
const size_t ind = label_order[i];
|
||||
const double p = preds[ind];
|
||||
const double exp_p = std::exp(p);
|
||||
const double w = info.GetWeight(ind);
|
||||
const double y = info.labels[ind];
|
||||
const double abs_y = std::abs(y);
|
||||
|
||||
// only update the denominator after we move forward in time (labels are sorted)
|
||||
// this is Breslow's method for ties
|
||||
accumulated_sum += last_exp_p;
|
||||
if (last_abs_y < abs_y) {
|
||||
exp_p_sum -= accumulated_sum;
|
||||
accumulated_sum = 0;
|
||||
} else {
|
||||
CHECK(last_abs_y <= abs_y) << "CoxRegression: labels must be in sorted order, " <<
|
||||
"MetaInfo::LabelArgsort failed!";
|
||||
}
|
||||
|
||||
if (y > 0) {
|
||||
r_k += 1.0/exp_p_sum;
|
||||
s_k += 1.0/(exp_p_sum*exp_p_sum);
|
||||
}
|
||||
|
||||
const double grad = exp_p*r_k - static_cast<bst_float>(y > 0);
|
||||
const double hess = exp_p*r_k - exp_p*exp_p * s_k;
|
||||
out_gpair->at(ind) = bst_gpair(grad * w, hess * w);
|
||||
|
||||
last_abs_y = abs_y;
|
||||
last_exp_p = exp_p;
|
||||
}
|
||||
}
|
||||
void PredTransform(std::vector<bst_float> *io_preds) override {
|
||||
std::vector<bst_float> &preds = *io_preds;
|
||||
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long j = 0; j < ndata; ++j) { // NOLINT(*)
|
||||
preds[j] = std::exp(preds[j]);
|
||||
}
|
||||
}
|
||||
void EvalTransform(std::vector<bst_float> *io_preds) override {
|
||||
PredTransform(io_preds);
|
||||
}
|
||||
bst_float ProbToMargin(bst_float base_score) const override {
|
||||
return std::log(base_score);
|
||||
}
|
||||
const char* DefaultEvalMetric(void) const override {
|
||||
return "cox-nloglik";
|
||||
}
|
||||
};
|
||||
|
||||
// register the objective function
|
||||
XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
|
||||
.describe("Cox regression for censored survival data (negative labels are considered censored).")
|
||||
.set_body([]() { return new CoxRegression(); });
|
||||
|
||||
// gamma regression
|
||||
class GammaRegression : public ObjFunction {
|
||||
public:
|
||||
|
||||
@@ -215,7 +215,9 @@ class CPUPredictor : public Predictor {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
bool approximate) override {
|
||||
bool approximate,
|
||||
int condition,
|
||||
unsigned condition_feature) override {
|
||||
const int nthread = omp_get_max_threads();
|
||||
InitThreadTemp(nthread, model.param.num_feature);
|
||||
const MetaInfo& info = p_fmat->info();
|
||||
@@ -232,12 +234,10 @@ class CPUPredictor : public Predictor {
|
||||
// make sure contributions is zeroed, we could be reusing a previously
|
||||
// allocated one
|
||||
std::fill(contribs.begin(), contribs.end(), 0);
|
||||
if (approximate) {
|
||||
// initialize tree node mean values
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
|
||||
model.trees[i]->FillNodeMeanValues();
|
||||
}
|
||||
// initialize tree node mean values
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
|
||||
model.trees[i]->FillNodeMeanValues();
|
||||
}
|
||||
// start collecting the contributions
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
@@ -263,7 +263,8 @@ class CPUPredictor : public Predictor {
|
||||
continue;
|
||||
}
|
||||
if (!approximate) {
|
||||
model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
|
||||
model.trees[j]->CalculateContributions(feats, root_id, p_contribs,
|
||||
condition, condition_feature);
|
||||
} else {
|
||||
model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs);
|
||||
}
|
||||
@@ -279,6 +280,50 @@ class CPUPredictor : public Predictor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SHAP interaction values, derived from conditional SHAP value differences.
//
// For each feature i (plus the bias slot) the per-row contribution vector is
// computed conditioned on i being present (+1) and absent (-1); half the
// difference at feature k is the symmetric interaction term (i, k).  The
// diagonal entry is the unconditional ("main") effect for i minus the sum of
// i's off-diagonal interactions, so each row's matrix sums to the prediction.
// See: "Axiomatic characterizations of probabilistic and
// cardinal-probabilistic interaction indices".
//
// Output layout: num_row x ngroup x (nfeat + 1) x (nfeat + 1).
//
// BUG FIX: strides and offsets are computed in size_t — the previous
// `unsigned` arithmetic (j * row_chunk, etc.) overflows 32 bits for large
// num_row * (nfeat + 1)^2, corrupting the output indexing.
void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
                                     const gbm::GBTreeModel& model, unsigned ntree_limit,
                                     bool approximate) override {
  const MetaInfo& info = p_fmat->info();
  const int ngroup = model.param.num_output_group;
  const size_t ncolumns = model.param.num_feature;
  const size_t row_chunk = ngroup * (ncolumns + 1) * (ncolumns + 1);
  const size_t mrow_chunk = (ncolumns + 1) * (ncolumns + 1);
  const size_t crow_chunk = ngroup * (ncolumns + 1);

  // allocate space for (nfeat + 1)^2 entries per row plus the temporary
  // off/on/diagonal contribution buffers
  std::vector<bst_float>& contribs = *out_contribs;
  contribs.resize(info.num_row * ngroup * (ncolumns + 1) * (ncolumns + 1));
  std::vector<bst_float> contribs_off(info.num_row * ngroup * (ncolumns + 1));
  std::vector<bst_float> contribs_on(info.num_row * ngroup * (ncolumns + 1));
  std::vector<bst_float> contribs_diag(info.num_row * ngroup * (ncolumns + 1));

  // compute the difference in effects when conditioning on each of the
  // features on and off
  PredictContribution(p_fmat, &contribs_diag, model, ntree_limit, approximate, 0, 0);
  for (size_t i = 0; i < ncolumns + 1; ++i) {
    PredictContribution(p_fmat, &contribs_off, model, ntree_limit, approximate, -1, i);
    PredictContribution(p_fmat, &contribs_on, model, ntree_limit, approximate, 1, i);

    for (size_t j = 0; j < info.num_row; ++j) {
      for (int l = 0; l < ngroup; ++l) {
        const size_t o_offset = j * row_chunk + l * mrow_chunk + i * (ncolumns + 1);
        const size_t c_offset = j * crow_chunk + l * (ncolumns + 1);
        contribs[o_offset + i] = 0;
        for (size_t k = 0; k < ncolumns + 1; ++k) {
          // fill in the diagonal with additive effects, and off-diagonal
          // with the interactions (split evenly across (i,k) and (k,i))
          if (k == i) {
            contribs[o_offset + i] += contribs_diag[c_offset + k];
          } else {
            contribs[o_offset + k] = (contribs_on[c_offset + k] - contribs_off[c_offset + k])/2.0;
            contribs[o_offset + i] -= contribs[o_offset + k];
          }
        }
      }
    }
  }
}
|
||||
std::vector<RegTree::FVec> thread_temp;
|
||||
};
|
||||
|
||||
|
||||
@@ -454,10 +454,22 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
bool approximate) override {
|
||||
cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit,
|
||||
approximate);
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit,
|
||||
bool approximate,
|
||||
int condition,
|
||||
unsigned condition_feature) override {
|
||||
cpu_predictor->PredictContribution(p_fmat, out_contribs, model,
|
||||
ntree_limit, approximate, condition, condition_feature);
|
||||
}
|
||||
|
||||
// No GPU implementation of SHAP interaction values yet: delegate to the CPU
// predictor.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     const gbm::GBTreeModel& model,
                                     unsigned ntree_limit,
                                     bool approximate) override {
  cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model,
                                                 ntree_limit, approximate);
}
|
||||
|
||||
void Init(const std::vector<std::pair<std::string, std::string>>& cfg,
|
||||
|
||||
Reference in New Issue
Block a user