From d878c36c84edd625e93a6ba1635cba4f69f3defa Mon Sep 17 00:00:00 2001 From: Scott Lundberg Date: Wed, 7 Feb 2018 18:38:01 -0800 Subject: [PATCH] Add SHAP interaction effects, fix minor bug, and add cox loss (#3043) * Add interaction effects and cox loss * Minimize whitespace changes * Cox loss now no longer needs a pre-sorted dataset. * Address code review comments * Remove mem check, rename to pred_interactions, include bias * Make lint happy * More lint fixes * Fix cox loss indexing * Fix main effects and tests * Fix lint * Use half interaction values on the off-diagonals * Fix lint again --- doc/parameter.md | 7 +- include/xgboost/data.h | 18 ++ include/xgboost/gbm.h | 9 +- include/xgboost/learner.h | 5 +- include/xgboost/predictor.h | 20 +- include/xgboost/tree_model.h | 88 +++++-- python-package/xgboost/core.py | 35 ++- src/c_api/c_api.cc | 3 +- src/gbm/gblinear.cc | 14 +- src/gbm/gbtree.cc | 10 +- src/learner.cc | 5 +- src/metric/rank_metric.cc | 50 ++++ src/objective/regression_obj.cc | 84 +++++++ src/predictor/cpu_predictor.cc | 61 ++++- src/predictor/gpu_predictor.cu | 20 +- tests/cpp/objective/test_regression_obj.cc | 12 + tests/python/test_basic.py | 68 ------ tests/python/test_shap.py | 252 +++++++++++++++++++++ tests/travis/run_test.sh | 2 +- 19 files changed, 638 insertions(+), 125 deletions(-) create mode 100644 tests/python/test_shap.py diff --git a/doc/parameter.md b/doc/parameter.md index d1a8e8f58..316edf89f 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -65,8 +65,8 @@ Parameters for Tree Booster - 'exact': Exact greedy algorithm. - 'approx': Approximate greedy algorithm using sketching and histogram. - 'hist': Fast histogram optimized approximate greedy algorithm. It uses some performance improvements such as bins caching. - - 'gpu_exact': GPU implementation of exact algorithm. - - 'gpu_hist': GPU implementation of hist algorithm. + - 'gpu_exact': GPU implementation of exact algorithm. + - 'gpu_hist': GPU implementation of hist algorithm. * sketch_eps, [default=0.03] - This is only used for approximate greedy algorithm. - This roughly translated into ```O(1 / sketch_eps)``` number of bins. @@ -170,6 +170,8 @@ Specify the learning task and the corresponding learning objective. The objectiv they can only be used when the entire training session uses the same dataset - "count:poisson" --poisson regression for count data, output mean of poisson distribution - max_delta_step is set to 0.7 by default in poisson regression (used to safeguard optimization) + - "survival:cox" --Cox regression for right censored survival time data (negative values are considered right censored). + Note that predictions are returned on the hazard ratio scale (i.e., as HR = exp(marginal_prediction) in the proportional hazard function h(t) = h0(t) * HR). - "multi:softmax" --set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes) - "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. - "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss @@ -197,6 +199,7 @@ Specify the learning task and the corresponding learning objective. 
The objectiv training repeatedly - "poisson-nloglik": negative log-likelihood for Poisson regression - "gamma-nloglik": negative log-likelihood for gamma regression + - "cox-nloglik": negative partial log-likelihood for Cox proportional hazards regression - "gamma-deviance": residual deviance for gamma regression - "tweedie-nloglik": negative log-likelihood for Tweedie regression (at a specified value of the tweedie_variance_power parameter) * seed [default=0] diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 0859e64ab..f7d9812ab 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "./base.h" namespace xgboost { @@ -76,6 +77,19 @@ struct MetaInfo { inline unsigned GetRoot(size_t i) const { return root_index.size() != 0 ? root_index[i] : 0U; } + /*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */ + inline const std::vector& LabelAbsSort() const { + if (label_order_cache.size() == labels.size()) { + return label_order_cache; + } + label_order_cache.resize(labels.size()); + std::iota(label_order_cache.begin(), label_order_cache.end(), 0); + const auto l = labels; + XGBOOST_PARALLEL_SORT(label_order_cache.begin(), label_order_cache.end(), + [&l](size_t i1, size_t i2) {return std::abs(l[i1]) < std::abs(l[i2]);}); + + return label_order_cache; + } /*! \brief clear all the information */ void Clear(); /*! @@ -96,6 +110,10 @@ struct MetaInfo { * \param num Number of elements in the source array. */ void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num); + + private: + /*! \brief argsort of labels */ + mutable std::vector label_order_cache; }; /*! \brief read-only sparse instance batch in CSR format */ diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 58babdf29..674f46109 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -124,10 +124,17 @@ class GradientBooster { * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means * we do not limit number of trees * \param approximate use a faster (inconsistent) approximation of SHAP values + * \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on). + * \param condition_feature feature to condition on (i.e. fix) during calculations */ virtual void PredictContribution(DMatrix* dmat, std::vector* out_contribs, - unsigned ntree_limit = 0, bool approximate = false) = 0; + unsigned ntree_limit = 0, bool approximate = false, + int condition = 0, unsigned condition_feature = 0) = 0; + + virtual void PredictInteractionContributions(DMatrix* dmat, + std::vector* out_contribs, + unsigned ntree_limit, bool approximate) = 0; /*! 
* \brief dump the model in the requested format diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 9b4149355..995a590c9 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -105,6 +105,7 @@ class Learner : public rabit::Serializable { * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor * \param pred_contribs whether to only predict the feature contributions * \param approx_contribs whether to approximate the feature contributions for speed + * \param pred_interactions whether to compute the feature pair contributions */ virtual void Predict(DMatrix* data, bool output_margin, @@ -112,7 +113,9 @@ class Learner : public rabit::Serializable { unsigned ntree_limit = 0, bool pred_leaf = false, bool pred_contribs = false, - bool approx_contribs = false) const = 0; + bool approx_contribs = false, + bool pred_interactions = false) const = 0; + /*! * \brief Set additional attribute to the Booster. * The property will be saved along the booster. diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 5fd857654..2c408740e 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -153,14 +153,24 @@ class Predictor { * a vector of length (nfeats + 1) * num_output_group * nsample, arranged in * that order. * - * \param [in,out] dmat The input feature matrix. - * \param [in,out] out_contribs The output feature contribs. - * \param model Model to make predictions from. - * \param ntree_limit (Optional) The ntree limit. - * \param approximate Use fast approximate algorithm. + * \param [in,out] dmat The input feature matrix. + * \param [in,out] out_contribs The output feature contribs. + * \param model Model to make predictions from. + * \param ntree_limit (Optional) The ntree limit. + * \param approximate Use fast approximate algorithm. + * \param condition Condition on the condition_feature (0=no, -1=cond off, 1=cond on). + * \param condition_feature Feature to condition on (i.e. fix) during calculations. */ virtual void PredictContribution(DMatrix* dmat, + std::vector* out_contribs, + const gbm::GBTreeModel& model, + unsigned ntree_limit = 0, + bool approximate = false, + int condition = 0, + unsigned condition_feature = 0) = 0; + + virtual void PredictInteractionContributions(DMatrix* dmat, std::vector* out_contribs, const gbm::GBTreeModel& model, unsigned ntree_limit = 0, diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 57b673b5e..291826384 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -501,13 +501,33 @@ class RegTree: public TreeModel { * \param feat dense feature vector, if the feature is missing the field is set to NaN * \param root_id starting root index of the instance * \param out_contribs output vector to hold the contributions + * \param condition fix one feature to either off (-1) on (1) or not fixed (0 default) + * \param condition_feature the index of the feature to fix */ inline void CalculateContributions(const RegTree::FVec& feat, unsigned root_id, - bst_float *out_contribs) const; + bst_float *out_contribs, + int condition = 0, + unsigned condition_feature = 0) const; + /*! + * \brief Recursive function that computes the feature attributions for a single tree. 
+ * \param feat dense feature vector, if the feature is missing the field is set to NaN + * \param phi dense output vector of feature attributions + * \param node_index the index of the current node in the tree + * \param unique_depth how many unique features are above the current node in the tree + * \param parent_unique_path a vector of statistics about our current path through the tree + * \param parent_zero_fraction what fraction of the parent path weight is coming as 0 (integrated) + * \param parent_one_fraction what fraction of the parent path weight is coming as 1 (fixed) + * \param parent_feature_index what feature the parent node used to split + * \param condition fix one feature to either off (-1) on (1) or not fixed (0 default) + * \param condition_feature the index of the feature to fix + * \param condition_fraction what fraction of the current weight matches our conditioning feature + */ inline void TreeShap(const RegTree::FVec& feat, bst_float *phi, unsigned node_index, unsigned unique_depth, PathElement *parent_unique_path, bst_float parent_zero_fraction, - bst_float parent_one_fraction, int parent_feature_index) const; + bst_float parent_one_fraction, int parent_feature_index, + int condition, unsigned condition_feature, + bst_float condition_fraction) const; /*! * \brief calculate the approximate feature contributions for the given root @@ -700,7 +720,7 @@ inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_ / static_cast((i + 1) * one_fraction); total += tmp; next_one_portion = unique_path[i].pweight - tmp * zero_fraction * ((unique_depth - i) - / static_cast(unique_depth+1)); + / static_cast(unique_depth + 1)); } else { total += (unique_path[i].pweight / zero_fraction) / ((unique_depth - i) / static_cast(unique_depth + 1)); @@ -713,15 +733,22 @@ inline bst_float UnwoundPathSum(const PathElement *unique_path, unsigned unique_ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, unsigned node_index, unsigned unique_depth, PathElement *parent_unique_path, bst_float parent_zero_fraction, - bst_float parent_one_fraction, int parent_feature_index) const { + bst_float parent_one_fraction, int parent_feature_index, + int condition, unsigned condition_feature, + bst_float condition_fraction) const { const auto node = (*this)[node_index]; + // stop if we have no weight coming down to us + if (condition_fraction == 0) return; + // extend the unique path - PathElement *unique_path = parent_unique_path + unique_depth; - if (unique_depth > 0) std::copy(parent_unique_path, - parent_unique_path + unique_depth, unique_path); - ExtendPath(unique_path, unique_depth, parent_zero_fraction, - parent_one_fraction, parent_feature_index); + PathElement *unique_path = parent_unique_path + unique_depth + 1; + std::copy(parent_unique_path, parent_unique_path + unique_depth + 1, unique_path); + + if (condition == 0 || condition_feature != static_cast(parent_feature_index)) { + ExtendPath(unique_path, unique_depth, parent_zero_fraction, + parent_one_fraction, parent_feature_index); + } const unsigned split_index = node.split_index(); // leaf node @@ -729,7 +756,8 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, for (unsigned i = 1; i <= unique_depth; ++i) { const bst_float w = UnwoundPathSum(unique_path, unique_depth, i); const PathElement &el = unique_path[i]; - phi[el.feature_index] += w * (el.one_fraction - el.zero_fraction) * node.leaf_value(); + phi[el.feature_index] += w * (el.one_fraction - el.zero_fraction) + * 
node.leaf_value() * condition_fraction; } // internal node @@ -764,34 +792,44 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi, unique_depth -= 1; } + // divide up the condition_fraction among the recursive calls + bst_float hot_condition_fraction = condition_fraction; + bst_float cold_condition_fraction = condition_fraction; + if (condition > 0 && split_index == condition_feature) { + cold_condition_fraction = 0; + unique_depth -= 1; + } else if (condition < 0 && split_index == condition_feature) { + hot_condition_fraction *= hot_zero_fraction; + cold_condition_fraction *= cold_zero_fraction; + unique_depth -= 1; + } + TreeShap(feat, phi, hot_index, unique_depth + 1, unique_path, - hot_zero_fraction*incoming_zero_fraction, incoming_one_fraction, split_index); + hot_zero_fraction * incoming_zero_fraction, incoming_one_fraction, + split_index, condition, condition_feature, hot_condition_fraction); TreeShap(feat, phi, cold_index, unique_depth + 1, unique_path, - cold_zero_fraction*incoming_zero_fraction, 0, split_index); + cold_zero_fraction * incoming_zero_fraction, 0, + split_index, condition, condition_feature, cold_condition_fraction); } } inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned root_id, - bst_float *out_contribs) const { + bst_float *out_contribs, + int condition, + unsigned condition_feature) const { // find the expected value of the tree's predictions - bst_float base_value = 0.0f; - bst_float total_cover = 0.0f; - for (int i = 0; i < (*this).param.num_nodes; ++i) { - const auto node = (*this)[i]; - if (node.is_leaf()) { - const auto cover = this->stat(i).sum_hess; - base_value += cover * node.leaf_value(); - total_cover += cover; - } + if (condition == 0) { + bst_float node_value = this->node_mean_values[static_cast(root_id)]; + out_contribs[feat.size()] += node_value; } - out_contribs[feat.size()] += base_value / total_cover; // Preallocate space for the unique path data - const int maxd = this->MaxDepth(root_id) + 1; + const int maxd = this->MaxDepth(root_id) + 2; PathElement *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2]; - TreeShap(feat, out_contribs, root_id, 0, unique_path_data, 1, 1, -1); + TreeShap(feat, out_contribs, root_id, 0, unique_path_data, + 1, 1, -1, condition, condition_feature, 1); delete[] unique_path_data; } diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 975f38a40..769df0e53 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -992,7 +992,7 @@ class Booster(object): return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False, - pred_contribs=False, approx_contribs=False): + pred_contribs=False, approx_contribs=False, pred_interactions=False): """ Predict with data. @@ -1019,14 +1019,21 @@ class Booster(object): in both tree 1 and tree 0. pred_contribs : bool - When this option is on, the output will be a matrix of (nsample, nfeats+1) + When this is True the output will be a matrix of size (nsample, nfeats + 1) with each record indicating the feature contributions (SHAP values) for that - prediction. The sum of all feature contributions is equal to the prediction. - Note that the bias is added as the final column, on top of the regular features. + prediction. The sum of all feature contributions is equal to the raw untransformed + margin value of the prediction. Note the final column is the bias term. 
approx_contribs : bool Approximate the contributions of each feature + pred_interactions : bool + When this is True the output will be a matrix of size (nsample, nfeats + 1, nfeats + 1) + indicating the SHAP interaction values for each pair of features. The sum of each + row (or column) of the interaction values equals the corresponding SHAP value (from + pred_contribs), and the sum of the entire matrix equals the raw untransformed margin + value of the prediction. Note the last row and column correspond to the bias term. + Returns ------- prediction : numpy array @@ -1040,6 +1047,8 @@ class Booster(object): option_mask |= 0x04 if approx_contribs: option_mask |= 0x08 + if pred_interactions: + option_mask |= 0x10 self._validate_features(data) @@ -1055,8 +1064,22 @@ class Booster(object): preds = preds.astype(np.int32) nrow = data.num_row() if preds.size != nrow and preds.size % nrow == 0: - ncol = int(preds.size / nrow) - preds = preds.reshape(nrow, ncol) + chunk_size = int(preds.size / nrow) + + if pred_interactions: + ngroup = int(chunk_size / ((data.num_col() + 1) * (data.num_col() + 1))) + if ngroup == 1: + preds = preds.reshape(nrow, data.num_col() + 1, data.num_col() + 1) + else: + preds = preds.reshape(nrow, ngroup, data.num_col() + 1, data.num_col() + 1) + elif pred_contribs: + ngroup = int(chunk_size / (data.num_col() + 1)) + if ngroup == 1: + preds = preds.reshape(nrow, data.num_col() + 1) + else: + preds = preds.reshape(nrow, ngroup, data.num_col() + 1) + else: + preds = preds.reshape(nrow, chunk_size) return preds def save_model(self, fname): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7756628d0..8774defef 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -759,7 +759,8 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, &preds, ntree_limit, (option_mask & 2) != 0, (option_mask & 4) != 0, - (option_mask & 8) != 0); + (option_mask & 8) != 0, + (option_mask & 16) != 0); *out_result = dmlc::BeginPtr(preds); *len = static_cast(preds.size()); API_END(); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 4bda25f92..4fa6ad230 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -224,7 +224,8 @@ class GBLinear : public GradientBooster { void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - unsigned ntree_limit, bool approximate) override { + unsigned ntree_limit, bool approximate, int condition = 0, + unsigned condition_feature = 0) override { if (model.weight.size() == 0) { model.InitModel(); } @@ -265,6 +266,17 @@ class GBLinear : public GradientBooster { } } + void PredictInteractionContributions(DMatrix* p_fmat, + std::vector* out_contribs, + unsigned ntree_limit, bool approximate) override { + std::vector& contribs = *out_contribs; + + // linear models have no interaction effects + const size_t nelements = model.param.num_feature*model.param.num_feature; + contribs.resize(p_fmat->info().num_row * nelements * model.param.num_output_group); + std::fill(contribs.begin(), contribs.end(), 0); + } + std::vector DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const override { diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 0e80386bd..1a509d90d 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -220,10 +220,18 @@ class GBTree : public GradientBooster { void PredictContribution(DMatrix* p_fmat, std::vector* out_contribs, - unsigned ntree_limit, bool approximate) override { + unsigned ntree_limit, bool approximate, int condition, + unsigned condition_feature) override { 
     predictor->PredictContribution(p_fmat, out_contribs, model_,
                                    ntree_limit, approximate);
   }
 
+  void PredictInteractionContributions(DMatrix* p_fmat,
+                                       std::vector<bst_float>* out_contribs,
+                                       unsigned ntree_limit, bool approximate) override {
+    predictor->PredictInteractionContributions(p_fmat, out_contribs, model_,
+                                                ntree_limit, approximate);
+  }
+
   std::vector<std::string> DumpModel(const FeatureMap& fmap,
                                      bool with_stats,
                                      std::string format) const override {
diff --git a/src/learner.cc b/src/learner.cc
index 321338d4a..cc93c34ff 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -443,9 +443,12 @@ class LearnerImpl : public Learner {
   void Predict(DMatrix* data, bool output_margin,
               std::vector<bst_float>* out_preds, unsigned ntree_limit,
-               bool pred_leaf, bool pred_contribs, bool approx_contribs) const override {
+               bool pred_leaf, bool pred_contribs, bool approx_contribs,
+               bool pred_interactions) const override {
     if (pred_contribs) {
       gbm_->PredictContribution(data, out_preds, ntree_limit, approx_contribs);
+    } else if (pred_interactions) {
+      gbm_->PredictInteractionContributions(data, out_preds, ntree_limit, approx_contribs);
     } else if (pred_leaf) {
       gbm_->PredictLeaf(data, out_preds, ntree_limit);
     } else {
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index d3047546a..032c08bd2 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -304,6 +304,52 @@ struct EvalMAP : public EvalRankList {
   }
 };
 
+/*! \brief Cox: Partial likelihood of the Cox proportional hazards model */
+struct EvalCox : public Metric {
+ public:
+  EvalCox() {}
+  bst_float Eval(const std::vector<bst_float> &preds,
+                 const MetaInfo &info,
+                 bool distributed) const override {
+    CHECK(!distributed) << "Cox metric does not support distributed evaluation";
+    using namespace std;  // NOLINT(*)
+
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
+    const std::vector<size_t> &label_order = info.LabelAbsSort();
+
+    // pre-compute a sum for the denominator
+    double exp_p_sum = 0;  // we use double because we might need the precision with large datasets
+    for (omp_ulong i = 0; i < ndata; ++i) {
+      exp_p_sum += preds[i];
+    }
+
+    double out = 0;
+    double accumulated_sum = 0;
+    bst_omp_uint num_events = 0;
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const size_t ind = label_order[i];
+      const auto label = info.labels[ind];
+      if (label > 0) {
+        out -= log(preds[ind]) - log(exp_p_sum);
+        ++num_events;
+      }
+
+      // only update the denominator after we move forward in time (labels are sorted)
+      accumulated_sum += preds[ind];
+      if (i == ndata - 1 || std::abs(label) < std::abs(info.labels[label_order[i + 1]])) {
+        exp_p_sum -= accumulated_sum;
+        accumulated_sum = 0;
+      }
+    }
+
+    return out/num_events;  // normalize by the number of events
+  }
+
+  const char* Name() const override {
+    return "cox-nloglik";
+  }
+};
+
 XGBOOST_REGISTER_METRIC(AMS, "ams")
 .describe("AMS metric for higgs.")
 .set_body([](const char* param) { return new EvalAMS(param); });
@@ -323,5 +369,9 @@ XGBOOST_REGISTER_METRIC(NDCG, "ndcg")
 XGBOOST_REGISTER_METRIC(MAP, "map")
 .describe("map@k for rank.")
 .set_body([](const char* param) { return new EvalMAP(param); });
+
+XGBOOST_REGISTER_METRIC(Cox, "cox-nloglik")
+.describe("Negative log partial likelihood of Cox proportional hazards model.")
+.set_body([](const char* param) { return new EvalCox(); });
 }  // namespace metric
 }  // namespace xgboost
diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc
index 92f1c5d3f..b3f74c8fb 100644
--- a/src/objective/regression_obj.cc
+++ b/src/objective/regression_obj.cc
@@ -197,6 +197,90 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
 .describe("Possion regression for count data.")
 .set_body([]() { return new PoissonRegression(); });
 
+// cox regression for survival data (negative values mean they are censored)
+class CoxRegression : public ObjFunction {
+ public:
+  // declare functions
+  void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {}
+  void GetGradient(const std::vector<bst_float> &preds,
+                   const MetaInfo &info,
+                   int iter,
+                   std::vector<bst_gpair> *out_gpair) override {
+    CHECK_NE(info.labels.size(), 0U) << "label set cannot be empty";
+    CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided";
+    out_gpair->resize(preds.size());
+    const std::vector<size_t> &label_order = info.LabelAbsSort();
+
+    const omp_ulong ndata = static_cast<omp_ulong>(preds.size());  // NOLINT(*)
+
+    // pre-compute a sum
+    double exp_p_sum = 0;  // we use double because we might need the precision with large datasets
+    for (omp_ulong i = 0; i < ndata; ++i) {
+      exp_p_sum += std::exp(preds[label_order[i]]);
+    }
+
+    // start calculating grad and hess
+    double r_k = 0;
+    double s_k = 0;
+    double last_exp_p = 0.0;
+    double last_abs_y = 0.0;
+    double accumulated_sum = 0;
+    for (omp_ulong i = 0; i < ndata; ++i) {  // NOLINT(*)
+      const size_t ind = label_order[i];
+      const double p = preds[ind];
+      const double exp_p = std::exp(p);
+      const double w = info.GetWeight(ind);
+      const double y = info.labels[ind];
+      const double abs_y = std::abs(y);
+
+      // only update the denominator after we move forward in time (labels are sorted)
+      // this is Breslow's method for ties
+      accumulated_sum += last_exp_p;
+      if (last_abs_y < abs_y) {
+        exp_p_sum -= accumulated_sum;
+        accumulated_sum = 0;
+      } else {
+        CHECK(last_abs_y <= abs_y) << "CoxRegression: labels must be in sorted order, " <<
+                                      "MetaInfo::LabelAbsSort failed!";
+      }
+
+      if (y > 0) {
+        r_k += 1.0/exp_p_sum;
+        s_k += 1.0/(exp_p_sum*exp_p_sum);
+      }
+
+      const double grad = exp_p*r_k - static_cast<bst_float>(y > 0);
+      const double hess = exp_p*r_k - exp_p*exp_p * s_k;
+      out_gpair->at(ind) = bst_gpair(grad * w, hess * w);
+
+      last_abs_y = abs_y;
+      last_exp_p = exp_p;
+    }
+  }
+  void PredTransform(std::vector<bst_float> *io_preds) override {
+    std::vector<bst_float> &preds = *io_preds;
+    const long ndata = static_cast<long>(preds.size());  // NOLINT(*)
+    #pragma omp parallel for schedule(static)
+    for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
+      preds[j] = std::exp(preds[j]);
+    }
+  }
+  void EvalTransform(std::vector<bst_float> *io_preds) override {
+    PredTransform(io_preds);
+  }
+  bst_float ProbToMargin(bst_float base_score) const override {
+    return std::log(base_score);
+  }
+  const char* DefaultEvalMetric(void) const override {
+    return "cox-nloglik";
+  }
+};
+
+// register the objective function
+XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
+.describe("Cox regression for censored survival data (negative labels are considered censored).")
+.set_body([]() { return new CoxRegression(); });
+
 // gamma regression
 class GammaRegression : public ObjFunction {
  public:
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index a56a3d85e..989a847a0 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -215,7 +215,9 @@ class CPUPredictor : public Predictor {
   void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
                            const gbm::GBTreeModel& model, unsigned ntree_limit,
-                           bool approximate) override {
+                           bool approximate,
+                           int condition,
+                           unsigned condition_feature) override {
     const int nthread = omp_get_max_threads();
InitThreadTemp(nthread, model.param.num_feature); const MetaInfo& info = p_fmat->info(); @@ -232,12 +234,10 @@ class CPUPredictor : public Predictor { // make sure contributions is zeroed, we could be reusing a previously // allocated one std::fill(contribs.begin(), contribs.end(), 0); - if (approximate) { - // initialize tree node mean values - #pragma omp parallel for schedule(static) - for (bst_omp_uint i = 0; i < ntree_limit; ++i) { - model.trees[i]->FillNodeMeanValues(); - } + // initialize tree node mean values + #pragma omp parallel for schedule(static) + for (bst_omp_uint i = 0; i < ntree_limit; ++i) { + model.trees[i]->FillNodeMeanValues(); } // start collecting the contributions dmlc::DataIter* iter = p_fmat->RowIterator(); @@ -263,7 +263,8 @@ class CPUPredictor : public Predictor { continue; } if (!approximate) { - model.trees[j]->CalculateContributions(feats, root_id, p_contribs); + model.trees[j]->CalculateContributions(feats, root_id, p_contribs, + condition, condition_feature); } else { model.trees[j]->CalculateContributionsApprox(feats, root_id, p_contribs); } @@ -279,6 +280,50 @@ class CPUPredictor : public Predictor { } } } + + void PredictInteractionContributions(DMatrix* p_fmat, std::vector* out_contribs, + const gbm::GBTreeModel& model, unsigned ntree_limit, + bool approximate) override { + const MetaInfo& info = p_fmat->info(); + const int ngroup = model.param.num_output_group; + size_t ncolumns = model.param.num_feature; + const unsigned row_chunk = ngroup * (ncolumns + 1) * (ncolumns + 1); + const unsigned mrow_chunk = (ncolumns + 1) * (ncolumns + 1); + const unsigned crow_chunk = ngroup * (ncolumns + 1); + + // allocate space for (number of features^2) times the number of rows and tmp off/on contribs + std::vector& contribs = *out_contribs; + contribs.resize(info.num_row * ngroup * (ncolumns + 1) * (ncolumns + 1)); + std::vector contribs_off(info.num_row * ngroup * (ncolumns + 1)); + std::vector contribs_on(info.num_row * ngroup * (ncolumns + 1)); + std::vector contribs_diag(info.num_row * ngroup * (ncolumns + 1)); + + // Compute the difference in effects when conditioning on each of the features on and off + // see: Axiomatic characterizations of probabilistic and + // cardinal-probabilistic interaction indices + PredictContribution(p_fmat, &contribs_diag, model, ntree_limit, approximate, 0, 0); + for (size_t i = 0; i < ncolumns + 1; ++i) { + PredictContribution(p_fmat, &contribs_off, model, ntree_limit, approximate, -1, i); + PredictContribution(p_fmat, &contribs_on, model, ntree_limit, approximate, 1, i); + + for (size_t j = 0; j < info.num_row; ++j) { + for (int l = 0; l < ngroup; ++l) { + const unsigned o_offset = j * row_chunk + l * mrow_chunk + i * (ncolumns + 1); + const unsigned c_offset = j * crow_chunk + l * (ncolumns + 1); + contribs[o_offset + i] = 0; + for (size_t k = 0; k < ncolumns + 1; ++k) { + // fill in the diagonal with additive effects, and off-diagonal with the interactions + if (k == i) { + contribs[o_offset + i] += contribs_diag[c_offset + k]; + } else { + contribs[o_offset + k] = (contribs_on[c_offset + k] - contribs_off[c_offset + k])/2.0; + contribs[o_offset + i] -= contribs[o_offset + k]; + } + } + } + } + } + } std::vector thread_temp; }; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index aace2ebe6..b0f34529e 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -454,10 +454,22 @@ class GPUPredictor : public xgboost::Predictor { void PredictContribution(DMatrix* p_fmat, 
std::vector* out_contribs, - const gbm::GBTreeModel& model, unsigned ntree_limit, - bool approximate) override { - cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, - approximate); + const gbm::GBTreeModel& model, + unsigned ntree_limit, + bool approximate, + int condition, + unsigned condition_feature) override { + cpu_predictor->PredictContribution(p_fmat, out_contribs, model, + ntree_limit, approximate, condition, condition_feature); + } + + void PredictInteractionContributions(DMatrix* p_fmat, + std::vector* out_contribs, + const gbm::GBTreeModel& model, + unsigned ntree_limit, + bool approximate) override { + cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, + ntree_limit, approximate); } void Init(const std::vector>& cfg, diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index 40404966d..3c679a6a5 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -172,3 +172,15 @@ TEST(Objective, TweedieRegressionBasic) { EXPECT_NEAR(preds[i], out_preds[i], 0.01f); } } + +TEST(Objective, CoxRegressionGPair) { + xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("survival:cox"); + std::vector > args; + obj->Configure(args); + CheckObjFunction(obj, + { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, + { 0, -2, -2, 2, 3, 5, -10, 100}, + { 1, 1, 1, 1, 1, 1, 1, 1}, + { 0, 0, 0, -0.799f, -0.788f, -0.590f, 0.910f, 1.006f}, + { 0, 0, 0, 0.160f, 0.186f, 0.348f, 0.610f, 0.639f}); +} diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 60209f263..5668779cb 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -2,7 +2,6 @@ import numpy as np import xgboost as xgb import unittest -import itertools import json dpath = 'demo/data/' @@ -143,35 +142,6 @@ class TestBasic(unittest.TestCase): dm = xgb.DMatrix(dummy, feature_names=list('abcde')) self.assertRaises(ValueError, bst.predict, dm) - def test_feature_importances(self): - data = np.random.randn(100, 5) - target = np.array([0, 1] * 50) - - features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] - - dm = xgb.DMatrix(data, label=target, - feature_names=features) - params = {'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} - - bst = xgb.train(params, dm, num_boost_round=10) - - # number of feature importances should == number of features - scores1 = bst.get_score() - scores2 = bst.get_score(importance_type='weight') - scores3 = bst.get_score(importance_type='cover') - scores4 = bst.get_score(importance_type='gain') - assert len(scores1) == len(features) - assert len(scores2) == len(features) - assert len(scores3) == len(features) - assert len(scores4) == len(features) - - # check backwards compatibility of get_fscore - fscores = bst.get_fscore() - assert scores1 == fscores - def test_dump(self): data = np.random.randn(100, 2) target = np.array([0, 1] * 50) @@ -268,41 +238,3 @@ class TestBasic(unittest.TestCase): cv = xgb.cv(params, dm, num_boost_round=10, shuffle=False, nfold=10, as_pandas=False) assert isinstance(cv, dict) assert len(cv) == (4) - - -def test_contributions(): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') - - def test_fn(max_depth, num_rounds): - # train - params = {'max_depth': max_depth, 'eta': 1, 'silent': 1} - bst = xgb.train(params, dtrain, num_boost_round=num_rounds) - - # predict - preds = bst.predict(dtest) - contribs = 
bst.predict(dtest, pred_contribs=True) - - # result should be (number of features + BIAS) * number of rows - assert contribs.shape == (dtest.num_row(), dtest.num_col() + 1) - - # sum of contributions should be same as predictions - np.testing.assert_array_almost_equal(np.sum(contribs, axis=1), preds) - - for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)): - yield test_fn, max_depth, num_rounds - - # check that we get the right SHAP values for a basic AND example - # (https://arxiv.org/abs/1706.06060) - X = np.zeros((4, 2)) - X[0, :] = 1 - X[1, 0] = 1 - X[2, 1] = 1 - y = np.zeros(4) - y[0] = 1 - param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} - bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) - out = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) - assert out[0, 0] == 0.375 - assert out[0, 1] == 0.375 - assert out[0, 2] == 0.25 diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py new file mode 100644 index 000000000..e4256dcc8 --- /dev/null +++ b/tests/python/test_shap.py @@ -0,0 +1,252 @@ +# -*- coding: utf-8 -*- +import numpy as np +import xgboost as xgb +import unittest +import itertools +import re +import scipy +import scipy.special + +dpath = 'demo/data/' +rng = np.random.RandomState(1994) + + +class TestSHAP(unittest.TestCase): + + def test_feature_importances(self): + data = np.random.randn(100, 5) + target = np.array([0, 1] * 50) + + features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] + + dm = xgb.DMatrix(data, label=target, + feature_names=features) + params = {'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + + # number of feature importances should == number of features + scores1 = bst.get_score() + scores2 = bst.get_score(importance_type='weight') + scores3 = bst.get_score(importance_type='cover') + scores4 = bst.get_score(importance_type='gain') + assert len(scores1) == len(features) + assert len(scores2) == len(features) + assert len(scores3) == len(features) + assert len(scores4) == len(features) + + # check backwards compatibility of get_fscore + fscores = bst.get_fscore() + assert scores1 == fscores + + dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') + dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + + def fn(max_depth, num_rounds): + # train + params = {'max_depth': max_depth, 'eta': 1, 'silent': 1} + bst = xgb.train(params, dtrain, num_boost_round=num_rounds) + + # predict + preds = bst.predict(dtest) + contribs = bst.predict(dtest, pred_contribs=True) + + # result should be (number of features + BIAS) * number of rows + assert contribs.shape == (dtest.num_row(), dtest.num_col() + 1) + + # sum of contributions should be same as predictions + np.testing.assert_array_almost_equal(np.sum(contribs, axis=1), preds) + + # for max_depth, num_rounds in itertools.product(range(0, 3), range(1, 5)): + # yield fn, max_depth, num_rounds + + # check that we get the right SHAP values for a basic AND example + # (https://arxiv.org/abs/1706.06060) + X = np.zeros((4, 2)) + X[0, :] = 1 + X[1, 0] = 1 + X[2, 1] = 1 + y = np.zeros(4) + y[0] = 1 + param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} + bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) + out = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) + assert out[0, 0] == 0.375 + assert out[0, 1] == 0.375 + assert out[0, 2] == 0.25 + + def parse_model(model): + trees = [] + r_exp = r"([0-9]+):\[f([0-9]+)<([0-9\.e-]+)\] 
yes=([0-9]+),no=([0-9]+).*cover=([0-9e\.]+)" + r_exp_leaf = r"([0-9]+):leaf=([0-9\.e-]+),cover=([0-9e\.]+)" + for tree in model.get_dump(with_stats=True): + + lines = list(tree.splitlines()) + trees.append([None for i in range(len(lines))]) + for line in lines: + match = re.search(r_exp, line) + if match is not None: + ind = int(match.group(1)) + while ind >= len(trees[-1]): + trees[-1].append(None) + trees[-1][ind] = { + "yes_ind": int(match.group(4)), + "no_ind": int(match.group(5)), + "value": None, + "threshold": float(match.group(3)), + "feature_index": int(match.group(2)), + "cover": float(match.group(6)) + } + else: + + match = re.search(r_exp_leaf, line) + ind = int(match.group(1)) + while ind >= len(trees[-1]): + trees[-1].append(None) + trees[-1][ind] = { + "value": float(match.group(2)), + "cover": float(match.group(3)) + } + return trees + + def exp_value_rec(tree, z, x, i=0): + if tree[i]["value"] is not None: + return tree[i]["value"] + else: + ind = tree[i]["feature_index"] + if z[ind] == 1: + if x[ind] < tree[i]["threshold"]: + return exp_value_rec(tree, z, x, tree[i]["yes_ind"]) + else: + return exp_value_rec(tree, z, x, tree[i]["no_ind"]) + else: + r_yes = tree[tree[i]["yes_ind"]]["cover"] / tree[i]["cover"] + out = exp_value_rec(tree, z, x, tree[i]["yes_ind"]) + val = out * r_yes + + r_no = tree[tree[i]["no_ind"]]["cover"] / tree[i]["cover"] + out = exp_value_rec(tree, z, x, tree[i]["no_ind"]) + val += out * r_no + return val + + def exp_value(trees, z, x): + return np.sum([exp_value_rec(tree, z, x) for tree in trees]) + + def all_subsets(ss): + return itertools.chain(*map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1))) + + def shap_value(trees, x, i, cond=None, cond_value=None): + M = len(x) + z = np.zeros(M) + other_inds = list(set(range(M)) - set([i])) + if cond is not None: + other_inds = list(set(other_inds) - set([cond])) + z[cond] = cond_value + M -= 1 + total = 0.0 + + for subset in all_subsets(other_inds): + if len(subset) > 0: + z[list(subset)] = 1 + v1 = exp_value(trees, z, x) + z[i] = 1 + v2 = exp_value(trees, z, x) + total += (v2 - v1) / (scipy.special.binom(M - 1, len(subset)) * M) + z[i] = 0 + z[list(subset)] = 0 + return total + + def shap_values(trees, x): + vals = [shap_value(trees, x, i) for i in range(len(x))] + vals.append(exp_value(trees, np.zeros(len(x)), x)) + return np.array(vals) + + def interaction_values(trees, x): + M = len(x) + out = np.zeros((M + 1, M + 1)) + for i in range(len(x)): + for j in range(len(x)): + if i != j: + out[i, j] = interaction_value(trees, x, i, j) / 2 + svals = shap_values(trees, x) + main_effects = svals - out.sum(1) + out[np.diag_indices_from(out)] = main_effects + return out + + def interaction_value(trees, x, i, j): + M = len(x) + z = np.zeros(M) + other_inds = list(set(range(M)) - set([i, j])) + + total = 0.0 + for subset in all_subsets(other_inds): + if len(subset) > 0: + z[list(subset)] = 1 + v00 = exp_value(trees, z, x) + z[i] = 1 + v10 = exp_value(trees, z, x) + z[j] = 1 + v11 = exp_value(trees, z, x) + z[i] = 0 + v01 = exp_value(trees, z, x) + z[j] = 0 + total += (v11 - v01 - v10 + v00) / (scipy.special.binom(M - 2, len(subset)) * (M - 1)) + z[list(subset)] = 0 + return total + + # test a simple and function + M = 2 + N = 4 + X = np.zeros((N, M)) + X[0, :] = 1 + X[1, 0] = 1 + X[2, 1] = 1 + y = np.zeros(N) + y[0] = 1 + param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} + bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) + brute_force = shap_values(parse_model(bst), X[0, :]) + 
fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) + assert np.linalg.norm(brute_force - fast_method[0, :]) < 1e-4 + + brute_force = interaction_values(parse_model(bst), X[0, :]) + fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True) + assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 + + # test a random function + np.random.seed(0) + M = 2 + N = 4 + X = np.random.randn(N, M) + y = np.random.randn(N) + param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} + bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) + brute_force = shap_values(parse_model(bst), X[0, :]) + fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) + assert np.linalg.norm(brute_force - fast_method[0, :]) < 1e-4 + + brute_force = interaction_values(parse_model(bst), X[0, :]) + fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True) + assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 + + # test another larger more complex random function + np.random.seed(0) + M = 5 + N = 100 + X = np.random.randn(N, M) + y = np.random.randn(N) + base_score = 1.0 + param = {"max_depth": 5, "base_score": base_score, "eta": 0.1, "gamma": 2.0} + bst = xgb.train(param, xgb.DMatrix(X, label=y), 10) + brute_force = shap_values(parse_model(bst), X[0, :]) + brute_force[-1] += base_score + fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_contribs=True) + assert np.linalg.norm(brute_force - fast_method[0, :]) < 1e-4 + + brute_force = interaction_values(parse_model(bst), X[0, :]) + brute_force[-1, -1] += base_score + fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True) + assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index 92e0ccb8f..c3d5c2391 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -103,7 +103,7 @@ if [ ${TASK} == "cmake_test" ]; then # Build/test without AVX mkdir build && cd build - cmake .. -DGOOGLE_TEST=ON + cmake .. -DGOOGLE_TEST=ON make cd .. ./testxgboost
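
For reference, a minimal usage sketch of the new `survival:cox` objective (not part of the patch; the synthetic data, parameter values, and variable names are illustrative). Absolute label values are survival times, negative labels mark right-censored observations, and predictions are returned on the hazard-ratio scale, i.e. `exp` of the raw margin:

```python
import numpy as np
import xgboost as xgb

# illustrative synthetic survival data (not from the patch)
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = rng.exponential(scale=10, size=100)   # survival times
y[rng.rand(100) < 0.3] *= -1              # negative label => right censored

dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'survival:cox', 'eval_metric': 'cox-nloglik',
          'eta': 0.1, 'max_depth': 3}
bst = xgb.train(params, dtrain, num_boost_round=20)

# predictions are hazard ratios, i.e. exp of the raw margin
hr = bst.predict(dtrain)
margin = bst.predict(dtrain, output_margin=True)
assert np.allclose(hr, np.exp(margin), rtol=1e-5)
```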
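The `cox-nloglik` metric registered above is the negative Breslow partial log-likelihood normalized by the number of observed events. A naive O(n^2) NumPy reference of the same quantity (an illustrative sketch, not code from the patch):

```python
import numpy as np

def cox_nloglik(hazard_ratio, label):
    """Naive reference for 'cox-nloglik' (illustrative, assumes no instance weights).

    |label| is the survival time; label > 0 means the event was observed,
    label < 0 means the observation is right censored.
    hazard_ratio holds predictions on the hazard-ratio scale (exp of the margin).
    """
    t = np.abs(label)
    out = 0.0
    events = 0
    for i in range(len(label)):
        if label[i] > 0:                  # only observed events contribute a term
            risk_set = t >= t[i]          # everyone still at risk at time t[i], ties included
            out -= np.log(hazard_ratio[i]) - np.log(hazard_ratio[risk_set].sum())
            events += 1
    return out / events                   # normalize by the number of events
```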
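Likewise, a short sketch (synthetic data, illustrative only) of the consistency properties stated in the new `predict` docstring: `pred_contribs` sums to the raw untransformed margin, and each row of the `pred_interactions` matrix sums to the corresponding SHAP value:

```python
import numpy as np
import xgboost as xgb

# illustrative synthetic regression data (not from the patch)
rng = np.random.RandomState(1994)
X = rng.randn(100, 5)
y = rng.randn(100)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({'max_depth': 4, 'eta': 0.1}, dtrain, num_boost_round=10)

margin = bst.predict(dtrain, output_margin=True)
contribs = bst.predict(dtrain, pred_contribs=True)          # (100, 6): 5 features + bias
interactions = bst.predict(dtrain, pred_interactions=True)  # (100, 6, 6)

# feature contributions sum to the raw margin
np.testing.assert_allclose(contribs.sum(axis=1), margin, rtol=1e-4, atol=1e-4)
# each row of the interaction matrix sums to the corresponding SHAP value
np.testing.assert_allclose(interactions.sum(axis=2), contribs, rtol=1e-4, atol=1e-4)
```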