From 143475b27ba195604fe584bf175c6185320724b4 Mon Sep 17 00:00:00 2001
From: "Dr. Kashif Rasul"
Date: Tue, 13 Nov 2018 12:30:40 +0100
Subject: [PATCH] use gain for sklearn feature_importances_ (#3876)

* use gain for sklearn feature_importances_

`gain` is a better feature importance criterion than the currently used `weight`

* added importance_type to class

* fixed test

* white space

* fix variable name

* fix deprecation warning

* fix exp array

* white spaces
---
 python-package/xgboost/sklearn.py | 11 ++++++---
 tests/python/test_with_sklearn.py | 37 +++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 864da78f3..c89cdb0b5 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -100,6 +100,9 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
+    importance_type: string, default "gain"
+        The feature importance type for the feature_importances_ property: either "gain",
+        "weight", "cover", "total_gain" or "total_cover".
     \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
@@ -133,7 +136,8 @@ class XGBModel(XGBModelBase):
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
+                 base_score=0.5, random_state=0, seed=None, missing=None,
+                 importance_type="gain", **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -159,6 +163,7 @@ class XGBModel(XGBModelBase):
         self.random_state = random_state
         self.nthread = nthread
         self.n_jobs = n_jobs
+        self.importance_type = importance_type
 
     def __setstate__(self, state):
         # backward compatibility code
@@ -517,8 +522,8 @@ class XGBModel(XGBModelBase):
             raise AttributeError('Feature importance is not defined for Booster type {}'
                                  .format(self.booster))
         b = self.get_booster()
-        fs = b.get_fscore()
-        all_features = [fs.get(f, 0.) for f in b.feature_names]
+        score = b.get_score(importance_type=self.importance_type)
+        all_features = [score.get(f, 0.) for f in b.feature_names]
         all_features = np.array(all_features, dtype=np.float32)
         return all_features / all_features.sum()
 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 93c4c4d10..8808ccdfb 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -104,14 +104,14 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)
 
 
-def test_feature_importances():
+def test_feature_importances_weight():
     tm._skip_if_no_sklearn()
     from sklearn.datasets import load_digits
 
     digits = load_digits(2)
     y = digits['target']
     X = digits['data']
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
 
     exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0.,
                     0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -127,10 +127,39 @@
     import pandas as pd
     y = pd.Series(digits['target'])
     X = pd.DataFrame(digits['data'])
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+
+def test_feature_importances_gain():
+    tm._skip_if_no_sklearn()
+    from sklearn.datasets import load_digits
+
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+
+    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00326159, 0., 0., 0.,
+                    0., 0., 0., 0., 0., 0.00297238, 0.00988034, 0., 0., 0., 0.,
+                    0., 0., 0.03512521, 0.41123885, 0., 0., 0., 0., 0.01326332,
+                    0.00160674, 0., 0.4206952, 0., 0., 0., 0., 0.00616747, 0.01237546,
+                    0., 0., 0., 0., 0., 0., 0., 0.08240705, 0., 0., 0., 0.,
+                    0., 0., 0., 0.00100649, 0., 0., 0., 0., 0.], dtype=np.float32)
+
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    # numeric columns
+    import pandas as pd
+    y = pd.Series(digits['target'])
+    X = pd.DataFrame(digits['data'])
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
 
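---

Note for reviewers: a minimal usage sketch, not part of the patch itself, showing how the new `importance_type` argument behaves once this change is merged. It mirrors the binary digits dataset used in the tests and assumes xgboost is built from this branch.

# Illustration only: exercises the new default importance_type="gain"
# and the explicit "weight" option retained by this patch.
import xgboost as xgb
from sklearn.datasets import load_digits

digits = load_digits(n_class=2)
X, y = digits['data'], digits['target']

# "gain" is now the default for the sklearn wrapper ...
clf_gain = xgb.XGBClassifier(random_state=0).fit(X, y)

# ... while the previous behaviour is still available explicitly.
clf_weight = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)

# feature_importances_ is normalized by its sum in both cases,
# so each array sums to 1.0 (up to floating-point error).
print(clf_gain.feature_importances_.sum(), clf_weight.feature_importances_.sum())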