use gain for sklearn feature_importances_ (#3876)

* use gain for sklearn feature_importances_

`gain` is a better feature importance criterion than the currently used `weight` (see the sketch just before the diff below)

* added importance_type to class

* fixed test

* white space

* fix variable name

* fix deprecation warning

* fix exp array

* white spaces
Author:    Dr. Kashif Rasul
Date:      2018-11-13 12:30:40 +01:00
Committer: Philip Hyunsu Cho
Parent:    926eb651fe
Commit:    143475b27b

2 changed files with 41 additions and 7 deletions
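
For background, `weight` simply counts how many times a feature is used to split across all trees, while `gain` averages the loss reduction those splits achieve, so a frequently used but uninformative feature ranks lower under `gain`. A minimal sketch of the two criteria on the underlying Booster API (the toy data and training parameters are illustrative, not part of this commit):

import numpy as np
import xgboost as xgb

# Illustrative toy data; any trained Booster would do.
X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=10)

# "weight": number of times each feature appears in a split, across all trees.
print(bst.get_score(importance_type='weight'))
# "gain": average loss reduction contributed by each feature's splits.
print(bst.get_score(importance_type='gain'))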


@@ -100,6 +100,9 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
+    importance_type: string, default "gain"
+        The feature importance type for the feature_importances_ property: either "gain",
+        "weight", "cover", "total_gain" or "total_cover".
     \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object. Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
@@ -133,7 +136,8 @@ class XGBModel(XGBModelBase):
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
+                 base_score=0.5, random_state=0, seed=None, missing=None,
+                 importance_type="gain", **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -159,6 +163,7 @@ class XGBModel(XGBModelBase):
         self.random_state = random_state
         self.nthread = nthread
         self.n_jobs = n_jobs
+        self.importance_type = importance_type

     def __setstate__(self, state):
         # backward compatibility code
@@ -517,8 +522,8 @@
             raise AttributeError('Feature importance is not defined for Booster type {}'
                                  .format(self.booster))
         b = self.get_booster()
-        fs = b.get_fscore()
-        all_features = [fs.get(f, 0.) for f in b.feature_names]
+        score = b.get_score(importance_type=self.importance_type)
+        all_features = [score.get(f, 0.) for f in b.feature_names]
         all_features = np.array(all_features, dtype=np.float32)
         return all_features / all_features.sum()
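
With this change the criterion is selectable from the estimator itself rather than fixed to split counts. A minimal usage sketch (the classifier choice and toy data are illustrative):

import numpy as np
from xgboost import XGBClassifier

X = np.random.rand(100, 5)
y = np.random.randint(2, size=100)

clf = XGBClassifier(n_estimators=10)  # importance_type now defaults to "gain"
clf.fit(X, y)
print(clf.feature_importances_)       # gain-based scores, normalized to sum to 1

clf = XGBClassifier(n_estimators=10, importance_type='cover')
clf.fit(X, y)
print(clf.feature_importances_)       # cover-based scores instead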