Implement feature score for linear model. (#7048)

* Add feature score support for linear model.
* Port R interface to the new implementation.
* Add linear model support in Python.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Jiaming Yuan
2021-06-25 14:34:02 +08:00
committed by GitHub
parent b2d300e727
commit 663136aa08
18 changed files with 367 additions and 232 deletions

View File

@@ -1195,10 +1195,13 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
const char ***out_features);
/*!
* \brief Calculate feature scores for tree models.
 * \brief Calculate feature scores for tree models. When used on a linear model, only the
 * `weight` importance type is defined, and the output scores are a row-major matrix with shape
 * [n_features, n_classes] for multi-class models. For tree models, out_n_features is always
 * equal to the number of output scores, and multiple importance types are supported.
*
* \param handle An instance of Booster
* \param json_config Parameters for computing scores. Accepted JSON keys are:
* \param handle An instance of Booster
* \param json_config Parameters for computing scores. Accepted JSON keys are:
* - importance_type: A JSON string with following possible values:
* * 'weight': the number of times a feature is used to split the data across all trees.
* * 'gain': the average gain across all splits the feature is used in.
@@ -1206,15 +1209,20 @@ XGB_DLL int XGBoosterGetStrFeatureInfo(BoosterHandle handle, const char *field,
* * 'total_gain': the total gain across all splits the feature is used in.
* * 'total_cover': the total coverage across all splits the feature is used in.
* - feature_map: An optional JSON string with URI or path to the feature map file.
* - feature_names: An optional JSON array with string names for each feature.
*
* \param out_length Length of output arrays.
* \param out_features An array of string as feature names, ordered the same as output scores.
* \param out_scores An array of floating point as feature scores.
* \param out_n_features Length of output feature names.
* \param out_features An array of string as feature names, ordered the same as output scores.
* \param out_dim Dimension of output feature scores.
* \param out_shape Shape of output feature scores with length of `out_dim`.
* \param out_scores An array of floating point as feature scores with shape of `out_shape`.
*
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config,
bst_ulong *out_length,
const char ***out_features,
float **out_scores);
bst_ulong *out_n_features,
char const ***out_features,
bst_ulong *out_dim,
bst_ulong const **out_shape,
float const **out_scores);
#endif // XGBOOST_C_API_H_

View File

@@ -184,9 +184,7 @@ class GradientBooster : public Model, public Configurable {
virtual void FeatureScore(std::string const &importance_type,
std::vector<bst_feature_t> *features,
std::vector<float> *scores) const {
LOG(FATAL) << "`feature_score` is not implemented for current booster.";
}
std::vector<float> *scores) const = 0;
/*!
* \brief Whether the current booster uses GPU.
*/

View File

@@ -13,6 +13,7 @@
#include <array>
#include <algorithm>
#include <utility>
#include <vector>
namespace xgboost {
/*!
@@ -59,6 +60,13 @@ template <typename T> class MatrixView {
strides_[0] = shape[1];
strides_[1] = 1;
}
// Build a 2-D view over a CPU-resident std::vector; the vector must hold
// exactly shape[0] * shape[1] elements (row-major layout).
MatrixView(std::vector<T> *vec, std::array<size_t, 2> shape)
    : device_{GenericParameter::kCpuId}, values_{*vec} {
  CHECK_EQ(vec->size(), shape[0] * shape[1]);
  shape_[0] = shape[0];
  shape_[1] = shape[1];
  // Row-major strides: one full row per step in dim 0, contiguous in dim 1.
  strides_[0] = shape[1];
  strides_[1] = 1;
}
MatrixView(HostDeviceVector<std::remove_const_t<T>> const *vec,
std::array<size_t, 2> shape, int32_t device)
: device_{device}, values_{InferValues(vec, device)} {