From 143475b27ba195604fe584bf175c6185320724b4 Mon Sep 17 00:00:00 2001
From: "Dr. Kashif Rasul"
Date: Tue, 13 Nov 2018 12:30:40 +0100
Subject: [PATCH] use gain for sklearn feature_importances_ (#3876)

* use gain for sklearn feature_importances_

`gain` is a better feature importance criterion than the currently used `weight`

* added importance_type to class

* fixed test

* white space

* fix variable name

* fix deprecation warning

* fix exp array

* white spaces
---
 python-package/xgboost/sklearn.py | 11 ++++++---
 tests/python/test_with_sklearn.py | 37 +++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 864da78f3..c89cdb0b5 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -100,6 +100,9 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
+    importance_type: string, default "gain"
+        The feature importance type for the feature_importances_ property: either "gain",
+        "weight", "cover", "total_gain" or "total_cover".
     \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
@@ -133,7 +136,8 @@ class XGBModel(XGBModelBase):
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
+                 base_score=0.5, random_state=0, seed=None, missing=None,
+                 importance_type="gain", **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth
@@ -159,6 +163,7 @@ class XGBModel(XGBModelBase):
         self.random_state = random_state
         self.nthread = nthread
         self.n_jobs = n_jobs
+        self.importance_type = importance_type
 
     def __setstate__(self, state):
         # backward compatibility code
@@ -517,8 +522,8 @@ class XGBModel(XGBModelBase):
             raise AttributeError('Feature importance is not defined for Booster type {}'
                                  .format(self.booster))
         b = self.get_booster()
-        fs = b.get_fscore()
-        all_features = [fs.get(f, 0.) for f in b.feature_names]
+        score = b.get_score(importance_type=self.importance_type)
+        all_features = [score.get(f, 0.) for f in b.feature_names]
         all_features = np.array(all_features, dtype=np.float32)
         return all_features / all_features.sum()
 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 93c4c4d10..8808ccdfb 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -104,14 +104,14 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)
 
 
-def test_feature_importances():
+def test_feature_importances_weight():
     tm._skip_if_no_sklearn()
     from sklearn.datasets import load_digits
 
     digits = load_digits(2)
     y = digits['target']
     X = digits['data']
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
 
     exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0.,
                     0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -127,10 +127,39 @@
     import pandas as pd
     y = pd.Series(digits['target'])
     X = pd.DataFrame(digits['data'])
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+
+def test_feature_importances_gain():
+    tm._skip_if_no_sklearn()
+    from sklearn.datasets import load_digits
+
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+
+    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00326159, 0., 0., 0.,
+                    0., 0., 0., 0., 0., 0.00297238, 0.00988034, 0., 0., 0., 0.,
+                    0., 0., 0.03512521, 0.41123885, 0., 0., 0., 0., 0.01326332,
+                    0.00160674, 0., 0.4206952, 0., 0., 0., 0., 0.00616747, 0.01237546,
+                    0., 0., 0., 0., 0., 0., 0., 0.08240705, 0., 0., 0., 0.,
+                    0., 0., 0., 0.00100649, 0., 0., 0., 0., 0.], dtype=np.float32)
+
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    # numeric columns
+    import pandas as pd
+    y = pd.Series(digits['target'])
+    X = pd.DataFrame(digits['data'])
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
 
 
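---

Note for reviewers: a minimal usage sketch, not part of the patch itself, showing how the new `importance_type` argument behaves once this change is merged. It mirrors the binary digits dataset used in the tests and assumes xgboost is built from this branch.

# Illustration only: exercises the new default importance_type="gain"
# and the explicit "weight" option retained by this patch.
import xgboost as xgb
from sklearn.datasets import load_digits

digits = load_digits(n_class=2)
X, y = digits['data'], digits['target']

# "gain" is now the default for the sklearn wrapper ...
clf_gain = xgb.XGBClassifier(random_state=0).fit(X, y)

# ... while the previous behaviour is still available explicitly.
clf_weight = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)

# feature_importances_ is normalized by its sum in both cases,
# so each array sums to 1.0 (up to floating-point error).
print(clf_gain.feature_importances_.sum(), clf_weight.feature_importances_.sum())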