Add SHAP interaction effects, fix minor bug, and add cox loss (#3043)
* Add interaction effects and cox loss * Minimize whitespace changes * Cox loss now no longer needs a pre-sorted dataset. * Address code review comments * Remove mem check, rename to pred_interactions, include bias * Make lint happy * More lint fixes * Fix cox loss indexing * Fix main effects and tests * Fix lint * Use half interaction values on the off-diagonals * Fix lint again
This commit is contained in:
committed by
Vadim Khotilovich
parent
077abb35cd
commit
d878c36c84
@@ -759,7 +759,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
&preds, ntree_limit,
|
||||
(option_mask & 2) != 0,
|
||||
(option_mask & 4) != 0,
|
||||
(option_mask & 8) != 0);
|
||||
(option_mask & 8) != 0,
|
||||
(option_mask & 16) != 0);
|
||||
*out_result = dmlc::BeginPtr(preds);
|
||||
*len = static_cast<xgboost::bst_ulong>(preds.size());
|
||||
API_END();
|
||||
|
||||
@@ -224,7 +224,8 @@ class GBLinear : public GradientBooster {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
unsigned ntree_limit, bool approximate, int condition = 0,
|
||||
unsigned condition_feature = 0) override {
|
||||
if (model.weight.size() == 0) {
|
||||
model.InitModel();
|
||||
}
|
||||
@@ -265,6 +266,17 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
}
|
||||
|
||||
// Compute SHAP interaction values for a linear model.
//
// Linear models are purely additive, so every pairwise interaction term is
// zero: the buffer only needs to be sized and zero-filled.
//
// BUG FIX: the output layout must match the tree predictor and the
// (nrow, nfeat + 1, nfeat + 1) reshape performed by callers — one
// (num_feature + 1) x (num_feature + 1) matrix per row per output group,
// where the extra slot holds the bias term.  The previous allocation used
// num_feature^2 and undersized the buffer.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     unsigned ntree_limit, bool approximate) override {
  std::vector<bst_float>& contribs = *out_contribs;

  // linear models have no interaction effects; include the bias column
  const size_t ncolumns = model.param.num_feature + 1;
  const size_t nelements = ncolumns * ncolumns;
  contribs.resize(p_fmat->info().num_row * nelements * model.param.num_output_group);
  std::fill(contribs.begin(), contribs.end(), 0);
}
|
||||
|
||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||
bool with_stats,
|
||||
std::string format) const override {
|
||||
|
||||
@@ -220,10 +220,18 @@ class GBTree : public GradientBooster {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
unsigned ntree_limit, bool approximate, int condition,
|
||||
unsigned condition_feature) override {
|
||||
predictor->PredictContribution(p_fmat, out_contribs, model_, ntree_limit, approximate);
|
||||
}
|
||||
|
||||
// SHAP interaction values: forwarded verbatim to the underlying predictor,
// which owns the per-backend implementation.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     unsigned ntree_limit,
                                     bool approximate) override {
  predictor->PredictInteractionContributions(
      p_fmat, out_contribs, model_, ntree_limit, approximate);
}
|
||||
|
||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||
bool with_stats,
|
||||
std::string format) const override {
|
||||
|
||||
@@ -443,9 +443,12 @@ class LearnerImpl : public Learner {
|
||||
|
||||
void Predict(DMatrix* data, bool output_margin,
|
||||
std::vector<bst_float>* out_preds, unsigned ntree_limit,
|
||||
bool pred_leaf, bool pred_contribs, bool approx_contribs) const override {
|
||||
bool pred_leaf, bool pred_contribs, bool approx_contribs,
|
||||
bool pred_interactions) const override {
|
||||
if (pred_contribs) {
|
||||
gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
|
||||
} else if (pred_interactions) {
|
||||
gbm_->PredictInteractionContributions(data, out_preds, ntree_limit, approx_contribs);
|
||||
} else if (pred_leaf) {
|
||||
gbm_->PredictLeaf(data, out_preds, ntree_limit);
|
||||
} else {
|
||||
|
||||
@@ -304,6 +304,52 @@ struct EvalMAP : public EvalRankList {
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
|
||||
struct EvalCox : public Metric {
|
||||
public:
|
||||
EvalCox() {}
|
||||
bst_float Eval(const std::vector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
bool distributed) const override {
|
||||
CHECK(!distributed) << "Cox metric does not support distributed evaluation";
|
||||
using namespace std; // NOLINT(*)
|
||||
|
||||
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
|
||||
const std::vector<size_t> &label_order = info.LabelAbsSort();
|
||||
|
||||
// pre-compute a sum for the denominator
|
||||
double exp_p_sum = 0; // we use double because we might need the precision with large datasets
|
||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||
exp_p_sum += preds[i];
|
||||
}
|
||||
|
||||
double out = 0;
|
||||
double accumulated_sum = 0;
|
||||
bst_omp_uint num_events = 0;
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
const size_t ind = label_order[i];
|
||||
const auto label = info.labels[ind];
|
||||
if (label > 0) {
|
||||
out -= log(preds[ind]) - log(exp_p_sum);
|
||||
++num_events;
|
||||
}
|
||||
|
||||
// only update the denominator after we move forward in time (labels are sorted)
|
||||
accumulated_sum += preds[ind];
|
||||
if (i == ndata - 1 || std::abs(label) < std::abs(info.labels[label_order[i + 1]])) {
|
||||
exp_p_sum -= accumulated_sum;
|
||||
accumulated_sum = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return out/num_events; // normalize by the number of events
|
||||
}
|
||||
|
||||
const char* Name() const override {
|
||||
return "cox-nloglik";
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_METRIC(AMS, "ams")
|
||||
.describe("AMS metric for higgs.")
|
||||
.set_body([](const char* param) { return new EvalAMS(param); });
|
||||
@@ -323,5 +369,9 @@ XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
|
||||
XGBOOST_REGISTER_METRIC(MAP, "map")
|
||||
.describe("map@k for rank.")
|
||||
.set_body([](const char* param) { return new EvalMAP(param); });
|
||||
|
||||
// Register the Cox partial-likelihood metric under its canonical name.
// (Fixes the "proportioanl" typo in the user-facing description.)
XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
.describe("Negative log partial likelihood of Cox proportional hazards model.")
.set_body([](const char* param) { return new EvalCox(); });
|
||||
} // namespace metric
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -197,6 +197,90 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
|
||||
.describe("Poisson regression for count data.")
|
||||
.set_body([]() { return new PoissonRegression(); });
|
||||
|
||||
// cox regression for survival data (negative values mean they are censored)
|
||||
class CoxRegression : public ObjFunction {
|
||||
public:
|
||||
// declare functions
|
||||
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
|
||||
void GetGradient(const std::vector<bst_float> &preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) override {
|
||||
CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
|
||||
CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
|
||||
out_gpair->resize(preds.size());
|
||||
const std::vector<size_t> &label_order = info.LabelAbsSort();
|
||||
|
||||
const omp_ulong ndata = static_cast<omp_ulong>(preds.size()); // NOLINT(*)
|
||||
|
||||
// pre-compute a sum
|
||||
double exp_p_sum = 0; // we use double because we might need the precision with large datasets
|
||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||
exp_p_sum += std::exp(preds[label_order[i]]);
|
||||
}
|
||||
|
||||
// start calculating grad and hess
|
||||
double r_k = 0;
|
||||
double s_k = 0;
|
||||
double last_exp_p = 0.0;
|
||||
double last_abs_y = 0.0;
|
||||
double accumulated_sum = 0;
|
||||
for (omp_ulong i = 0; i < ndata; ++i) { // NOLINT(*)
|
||||
const size_t ind = label_order[i];
|
||||
const double p = preds[ind];
|
||||
const double exp_p = std::exp(p);
|
||||
const double w = info.GetWeight(ind);
|
||||
const double y = info.labels[ind];
|
||||
const double abs_y = std::abs(y);
|
||||
|
||||
// only update the denominator after we move forward in time (labels are sorted)
|
||||
// this is Breslow's method for ties
|
||||
accumulated_sum += last_exp_p;
|
||||
if (last_abs_y < abs_y) {
|
||||
exp_p_sum -= accumulated_sum;
|
||||
accumulated_sum = 0;
|
||||
} else {
|
||||
CHECK(last_abs_y <= abs_y) << "CoxRegression: labels must be in sorted order, " <<
|
||||
"MetaInfo::LabelArgsort failed!";
|
||||
}
|
||||
|
||||
if (y > 0) {
|
||||
r_k += 1.0/exp_p_sum;
|
||||
s_k += 1.0/(exp_p_sum*exp_p_sum);
|
||||
}
|
||||
|
||||
const double grad = exp_p*r_k - static_cast<bst_float>(y > 0);
|
||||
const double hess = exp_p*r_k - exp_p*exp_p * s_k;
|
||||
out_gpair->at(ind) = bst_gpair(grad * w, hess * w);
|
||||
|
||||
last_abs_y = abs_y;
|
||||
last_exp_p = exp_p;
|
||||
}
|
||||
}
|
||||
void PredTransform(std::vector<bst_float> *io_preds) override {
|
||||
std::vector<bst_float> &preds = *io_preds;
|
||||
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long j = 0; j < ndata; ++j) { // NOLINT(*)
|
||||
preds[j] = std::exp(preds[j]);
|
||||
}
|
||||
}
|
||||
void EvalTransform(std::vector<bst_float> *io_preds) override {
|
||||
PredTransform(io_preds);
|
||||
}
|
||||
bst_float ProbToMargin(bst_float base_score) const override {
|
||||
return std::log(base_score);
|
||||
}
|
||||
const char* DefaultEvalMetric(void) const override {
|
||||
return "cox-nloglik";
|
||||
}
|
||||
};
|
||||
|
||||
// register the objective function
|
||||
XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
|
||||
.describe("Cox regression for censored survival data (negative labels are considered censored).")
|
||||
.set_body([]() { return new CoxRegression(); });
|
||||
|
||||
// gamma regression
|
||||
class GammaRegression : public ObjFunction {
|
||||
public:
|
||||
|
||||
@@ -215,7 +215,9 @@ class CPUPredictor : public Predictor {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
bool approximate) override {
|
||||
bool approximate,
|
||||
int condition,
|
||||
unsigned condition_feature) override {
|
||||
const int nthread = omp_get_max_threads();
|
||||
InitThreadTemp(nthread, model.param.num_feature);
|
||||
const MetaInfo& info = p_fmat->info();
|
||||
@@ -232,12 +234,10 @@ class CPUPredictor : public Predictor {
|
||||
// make sure contributions is zeroed, we could be reusing a previously
|
||||
// allocated one
|
||||
std::fill(contribs.begin(), contribs.end(), 0);
|
||||
if (approximate) {
|
||||
// initialize tree node mean values
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
|
||||
model.trees[i]->FillNodeMeanValues();
|
||||
}
|
||||
// initialize tree node mean values
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
|
||||
model.trees[i]->FillNodeMeanValues();
|
||||
}
|
||||
// start collecting the contributions
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
@@ -263,7 +263,8 @@ class CPUPredictor : public Predictor {
|
||||
continue;
|
||||
}
|
||||
if (!approximate) {
|
||||
model.trees[j]->CalculateContributions(feats, root_id, p_contribs);
|
||||
model.trees[j]->CalculateContributions(feats, root_id, p_contribs,
|
||||
condition, condition_feature);
|
||||
} else {
|
||||
model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs);
|
||||
}
|
||||
@@ -279,6 +280,50 @@ class CPUPredictor : public Predictor {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SHAP interaction values, derived from conditional SHAP value differences.
//
// For each feature i (plus the bias slot) the per-row contribution vector is
// computed conditioned on i being present (+1) and absent (-1); half the
// difference at feature k is the symmetric interaction term (i, k).  The
// diagonal entry is the unconditional ("main") effect for i minus the sum of
// i's off-diagonal interactions, so each row's matrix sums to the prediction.
// See: "Axiomatic characterizations of probabilistic and
// cardinal-probabilistic interaction indices".
//
// Output layout: num_row x ngroup x (nfeat + 1) x (nfeat + 1).
//
// BUG FIX: strides and offsets are computed in size_t — the previous
// `unsigned` arithmetic (j * row_chunk, etc.) overflows 32 bits for large
// num_row * (nfeat + 1)^2, corrupting the output indexing.
void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
                                     const gbm::GBTreeModel& model, unsigned ntree_limit,
                                     bool approximate) override {
  const MetaInfo& info = p_fmat->info();
  const int ngroup = model.param.num_output_group;
  const size_t ncolumns = model.param.num_feature;
  const size_t row_chunk = ngroup * (ncolumns + 1) * (ncolumns + 1);
  const size_t mrow_chunk = (ncolumns + 1) * (ncolumns + 1);
  const size_t crow_chunk = ngroup * (ncolumns + 1);

  // allocate space for (nfeat + 1)^2 entries per row plus the temporary
  // off/on/diagonal contribution buffers
  std::vector<bst_float>& contribs = *out_contribs;
  contribs.resize(info.num_row * ngroup * (ncolumns + 1) * (ncolumns + 1));
  std::vector<bst_float> contribs_off(info.num_row * ngroup * (ncolumns + 1));
  std::vector<bst_float> contribs_on(info.num_row * ngroup * (ncolumns + 1));
  std::vector<bst_float> contribs_diag(info.num_row * ngroup * (ncolumns + 1));

  // compute the difference in effects when conditioning on each of the
  // features on and off
  PredictContribution(p_fmat, &contribs_diag, model, ntree_limit, approximate, 0, 0);
  for (size_t i = 0; i < ncolumns + 1; ++i) {
    PredictContribution(p_fmat, &contribs_off, model, ntree_limit, approximate, -1, i);
    PredictContribution(p_fmat, &contribs_on, model, ntree_limit, approximate, 1, i);

    for (size_t j = 0; j < info.num_row; ++j) {
      for (int l = 0; l < ngroup; ++l) {
        const size_t o_offset = j * row_chunk + l * mrow_chunk + i * (ncolumns + 1);
        const size_t c_offset = j * crow_chunk + l * (ncolumns + 1);
        contribs[o_offset + i] = 0;
        for (size_t k = 0; k < ncolumns + 1; ++k) {
          // fill in the diagonal with additive effects, and off-diagonal
          // with the interactions (split evenly across (i,k) and (k,i))
          if (k == i) {
            contribs[o_offset + i] += contribs_diag[c_offset + k];
          } else {
            contribs[o_offset + k] = (contribs_on[c_offset + k] - contribs_off[c_offset + k])/2.0;
            contribs[o_offset + i] -= contribs[o_offset + k];
          }
        }
      }
    }
  }
}
|
||||
std::vector<RegTree::FVec> thread_temp;
|
||||
};
|
||||
|
||||
|
||||
@@ -454,10 +454,22 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
std::vector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
bool approximate) override {
|
||||
cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit,
|
||||
approximate);
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit,
|
||||
bool approximate,
|
||||
int condition,
|
||||
unsigned condition_feature) override {
|
||||
cpu_predictor->PredictContribution(p_fmat, out_contribs, model,
|
||||
ntree_limit, approximate, condition, condition_feature);
|
||||
}
|
||||
|
||||
// No GPU implementation of SHAP interaction values yet: delegate to the CPU
// predictor.
void PredictInteractionContributions(DMatrix* p_fmat,
                                     std::vector<bst_float>* out_contribs,
                                     const gbm::GBTreeModel& model,
                                     unsigned ntree_limit,
                                     bool approximate) override {
  cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model,
                                                 ntree_limit, approximate);
}
|
||||
|
||||
void Init(const std::vector<std::pair<std::string, std::string>>& cfg,
|
||||
|
||||
Reference in New Issue
Block a user