use gain for sklearn feature_importances_ (#3876)
* use gain for sklearn feature_importances_: `gain` is a better feature importance criterion than the currently used `weight`
* added importance_type to class
* fixed test
* white space
* fix variable name
* fix deprecation warning
* fix exp array
* white spaces
This commit is contained in:
parent 926eb651fe
commit 143475b27b
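For context: "weight" counts how many times a feature is used to split across all trees, while "gain" averages the loss reduction contributed by each split on that feature. A minimal sketch of the two on a raw Booster, assuming only that xgboost is installed (the data here is synthetic and purely illustrative):

    import numpy as np
    import xgboost as xgb

    # Synthetic binary classification problem where feature 0 carries the signal.
    rng = np.random.RandomState(0)
    X = rng.rand(200, 4)
    y = (X[:, 0] > 0.5).astype(int)

    dtrain = xgb.DMatrix(X, label=y)
    booster = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=10)

    # get_score is the Booster-level API this commit delegates to.
    print(booster.get_score(importance_type='weight'))  # split counts per feature
    print(booster.get_score(importance_type='gain'))    # average gain per split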
python-package/xgboost/sklearn.py

@@ -100,6 +100,9 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
+    importance_type: string, default "gain"
+        The feature importance type for the feature_importances_ property: either "gain",
+        "weight", "cover", "total_gain" or "total_cover".
     \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object. Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
@@ -133,7 +136,8 @@ class XGBModel(XGBModelBase):
                  n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0,
                  subsample=1, colsample_bytree=1, colsample_bylevel=1,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
-                 base_score=0.5, random_state=0, seed=None, missing=None, **kwargs):
+                 base_score=0.5, random_state=0, seed=None, missing=None,
+                 importance_type="gain", **kwargs):
         if not SKLEARN_INSTALLED:
             raise XGBoostError('sklearn needs to be installed in order to use this module')
         self.max_depth = max_depth

@@ -159,6 +163,7 @@ class XGBModel(XGBModelBase):
         self.random_state = random_state
         self.nthread = nthread
         self.n_jobs = n_jobs
+        self.importance_type = importance_type

     def __setstate__(self, state):
         # backward compatibility code
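With the constructor change above, the importance type becomes a first-class estimator parameter. A minimal usage sketch (synthetic data; any of "weight", "cover", "total_gain" or "total_cover" can be passed instead of the new default, "gain"):

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    y = rng.randint(2, size=100)

    # importance_type is stored on the estimator and consulted lazily by the
    # feature_importances_ property; because it is a plain __init__ argument,
    # it also shows up in sklearn's get_params()/set_params().
    clf = xgb.XGBClassifier(importance_type="cover").fit(X, y)
    print(clf.feature_importances_)  # one normalized value per feature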
@@ -517,8 +522,8 @@ class XGBModel(XGBModelBase):
             raise AttributeError('Feature importance is not defined for Booster type {}'
                                  .format(self.booster))
         b = self.get_booster()
-        fs = b.get_fscore()
-        all_features = [fs.get(f, 0.) for f in b.feature_names]
+        score = b.get_score(importance_type=self.importance_type)
+        all_features = [score.get(f, 0.) for f in b.feature_names]
         all_features = np.array(all_features, dtype=np.float32)
         return all_features / all_features.sum()
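The property now delegates to get_score(importance_type=...) instead of the weight-only get_fscore(). A sketch of the computation the rewritten body performs (mirroring the diff above; synthetic data for illustration): features that never appear in a split default to 0, and the vector is normalized to sum to 1.

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    y = rng.randint(2, size=100)
    model = xgb.XGBClassifier(importance_type="gain").fit(X, y)

    # Recompute feature_importances_ by hand, step for step.
    b = model.get_booster()
    score = b.get_score(importance_type="gain")
    manual = np.array([score.get(f, 0.) for f in b.feature_names],
                      dtype=np.float32)
    manual /= manual.sum()

    np.testing.assert_almost_equal(manual, model.feature_importances_)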
tests/python/test_with_sklearn.py

@@ -104,14 +104,14 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)


-def test_feature_importances():
+def test_feature_importances_weight():
     tm._skip_if_no_sklearn()
     from sklearn.datasets import load_digits

     digits = load_digits(2)
     y = digits['target']
     X = digits['data']
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)

     exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0.,
                     0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0.,
@@ -127,10 +127,39 @@ def test_feature_importances():
     import pandas as pd
     y = pd.Series(digits['target'])
     X = pd.DataFrame(digits['data'])
-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)

-    xgb_model = xgb.XGBClassifier(seed=0).fit(X, y)
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+
+def test_feature_importances_gain():
+    tm._skip_if_no_sklearn()
+    from sklearn.datasets import load_digits
+
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+
+    exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00326159, 0., 0., 0.,
+                    0., 0., 0., 0., 0., 0.00297238, 0.00988034, 0., 0., 0., 0.,
+                    0., 0., 0.03512521, 0.41123885, 0., 0., 0., 0., 0.01326332,
+                    0.00160674, 0., 0.4206952, 0., 0., 0., 0., 0.00616747, 0.01237546,
+                    0., 0., 0., 0., 0., 0., 0., 0.08240705, 0., 0., 0., 0.,
+                    0., 0., 0., 0.00100649, 0., 0., 0., 0., 0.], dtype=np.float32)
+
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    # numeric columns
+    import pandas as pd
+    y = pd.Series(digits['target'])
+    X = pd.DataFrame(digits['data'])
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
+    np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
+
+    xgb_model = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)
     np.testing.assert_almost_equal(xgb_model.feature_importances_, exp)
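The old single test was split in two because the expected arrays cannot be shared: on the same digits subset, "weight" and "gain" produce different normalized vectors (the gain array above is dominated by two features at 0.41123885 and 0.4206952, while the weight array spreads across many small split-count fractions). A sketch of that contrast, assuming scikit-learn is installed (n_class is passed by keyword for compatibility with newer scikit-learn releases):

    import xgboost as xgb
    from sklearn.datasets import load_digits

    digits = load_digits(n_class=2)
    X, y = digits['data'], digits['target']

    w = xgb.XGBClassifier(random_state=0, importance_type="weight").fit(X, y)
    g = xgb.XGBClassifier(random_state=0, importance_type="gain").fit(X, y)

    # Both vectors sum to 1, but they weight features differently in general.
    print(w.feature_importances_.argmax(), g.feature_importances_.argmax())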