@property
def feature_importances_(self):
    """Normalized feature importances from the fitted booster.

    Importances are the booster's split counts (``get_fscore``) scaled to
    sum to 1. Features the booster never split on get importance 0.

    Returns
    -------
    feature_importances_ : array of shape = [n_features]
    """
    b = self.booster()
    fs = b.get_fscore()
    if b.feature_names is None:
        # Default feature names are 'f0', 'f1', ...: fill the output by
        # explicit integer index so the result never depends on dict
        # iteration order (not guaranteed on older Python versions).
        all_features = np.zeros(self._features_count, dtype=np.float32)
        for name, score in fs.items():
            all_features[int(name[1:])] = score
    else:
        # Named features (e.g. pandas columns): preserve the original
        # column order; absent features default to 0.
        all_features = np.array([fs.get(f, 0.) for f in b.feature_names],
                                dtype=np.float32)
    # NOTE(review): if the booster made no splits at all, sum() is 0 and
    # this yields NaNs — kept as-is to preserve the existing contract.
    return all_features / all_features.sum()
def test_feature_importances():
    """feature_importances_ must be identical for ndarray input and for
    pandas DataFrames with numeric or string column labels (same order)."""
    digits = load_digits(2)
    y = digits['target']
    X = digits['data']

    # Reference importances for this fixed dataset and seed.
    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                    0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
                    0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0.,
                    0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0.,
                    0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667,
                    0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0.,
                    0.], dtype=np.float32)

    # Plain numpy arrays.
    clf = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(clf.feature_importances_, exp)

    # pandas containers with numeric column labels.
    import pandas as pd
    y = pd.Series(digits['target'])
    X = pd.DataFrame(digits['data'])
    clf = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(clf.feature_importances_, exp)

    # String column labels: the feature order must be kept.
    letters = list('abcdefghijklmnopqrstuvwxyz')
    X.columns = ["".join(random.sample(letters, 5)) for _ in range(64)]
    clf = xgb.XGBClassifier(seed=0).fit(X, y)
    np.testing.assert_almost_equal(clf.feature_importances_, exp)