Merge pull request #327 from jseabold/sklearn-eval-set

ENH: Allow early stopping through scikit-learn API

Commit eee0d5b065
@@ -8,7 +8,7 @@ import pickle
 import xgboost as xgb

 import numpy as np
-from sklearn.cross_validation import KFold
+from sklearn.cross_validation import KFold, train_test_split
 from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn.grid_search import GridSearchCV
 from sklearn.datasets import load_iris, load_digits, load_boston
@@ -65,3 +65,23 @@ print("Pickling sklearn API models")
 pickle.dump(clf, open("best_boston.pkl", "wb"))
 clf2 = pickle.load(open("best_boston.pkl", "rb"))
 print(np.allclose(clf.predict(X), clf2.predict(X)))
+
+# Early-stopping
+
+X = digits['data']
+y = digits['target']
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+clf = xgb.XGBClassifier()
+clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
+        eval_set=[(X_test, y_test)])
+
+# Custom evaluation function
+from sklearn.metrics import log_loss
+
+
+def log_loss_eval(y_pred, y_true):
+    return "log-loss", log_loss(y_true.get_label(), y_pred)
+
+
+clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=log_loss_eval,
+        eval_set=[(X_test, y_test)])
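A minimal sketch of inspecting the classifier after the early-stopped fit
above (these attributes are set by the wrapper changes below, and only when
an eval_set was given):

    print(clf.best_score_)      # best validation score reached
    print(clf.best_iteration_)  # boosting round that produced it
    print(clf.eval_results_)    # e.g. {"validation_0": array([...])}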
@@ -6,7 +6,7 @@ Version: 0.40
 Authors: Tianqi Chen, Bing Xu
 Early stopping by Zygmunt Zając
 """
-# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name
+# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name, fixme
 from __future__ import absolute_import

 import os
@@ -738,7 +738,7 @@ class Booster(object):


 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
-          early_stopping_rounds=None, evals_result=None):
+          early_stopping_rounds=None, evals_result=None, verbose_eval=True):
     # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
     """Train a booster with given parameters.

@@ -767,12 +767,14 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         bst.best_score and bst.best_iteration.
     evals_result: dict
         This dictionary stores the evaluation results of all the items in watchlist
+    verbose_eval : bool
+        If `verbose_eval` then the evaluation metric on the validation set, if
+        given, is printed at each boosting stage.

     Returns
     -------
     booster : a trained booster model
     """

     evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
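For comparison, a sketch of the new flag from the low-level API (params,
dtrain, and dtest are assumed to be an existing parameter dict and DMatrix
objects):

    evals_result = {}
    bst = xgb.train(params, dtrain, num_boost_round=100,
                    evals=[(dtest, 'eval')],
                    early_stopping_rounds=10,
                    evals_result=evals_result,
                    verbose_eval=False)   # silence per-round stderr output
    print(evals_result['eval'])           # the history is still recorded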
@@ -782,7 +784,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     else:
         evals_name = [d[1] for d in evals]
         evals_result.clear()
-        evals_result.update({key:[] for key in evals_name})
+        evals_result.update({key: [] for key in evals_name})

     if not early_stopping_rounds:
         for i in range(num_boost_round):
@@ -794,9 +796,10 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
                 else:
                     msg = bst_eval_set.decode()

-                sys.stderr.write(msg + '\n')
+                if verbose_eval:
+                    sys.stderr.write(msg + '\n')
                 if evals_result is not None:
-                    res = re.findall(":([0-9.]+).", msg)
+                    res = re.findall(":-?([0-9.]+).", msg)
                     for key, val in zip(evals_name, res):
                         evals_result[key].append(val)
         return bst
@@ -840,10 +843,11 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         else:
             msg = bst_eval_set.decode()

-        sys.stderr.write(msg + '\n')
+        if verbose_eval:
+            sys.stderr.write(msg + '\n')

         if evals_result is not None:
-            res = re.findall(":([0-9.]+).", msg)
+            res = re.findall(":-?([0-9.]+).", msg)
             for key, val in zip(evals_name, res):
                 evals_result[key].append(val)

@@ -1074,6 +1078,8 @@ class XGBModel(XGBModelBase):
         params = super(XGBModel, self).get_params(deep=deep)
         if params['missing'] is np.nan:
             params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
         return params

     def get_xgb_params(self):
@@ -1086,10 +1092,71 @@ class XGBModel(XGBModelBase):
         xgb_params.pop('nthread', None)
         return xgb_params

-    def fit(self, data, y):
-        # pylint: disable=missing-docstring,invalid-name
-        train_dmatrix = DMatrix(data, label=y, missing=self.missing)
-        self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a (str, value) pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. The metric is always minimized.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        train_dmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals, ["validation_{}".format(i) for i in
+                                     range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, train_dmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
         return self

     def predict(self, data):
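A short sketch of the dispatch above: a string metric travels through the
booster params, while a callable is routed to train()'s feval argument
(X_train, X_test, y_train, y_test as in the demo; the mae helper is
illustrative):

    model = xgb.XGBRegressor()

    # Built-in metric: passed on as the 'eval_metric' booster param.
    model.fit(X_train, y_train, eval_metric="rmse",
              eval_set=[(X_test, y_test)], early_stopping_rounds=5)

    # Callable metric: goes to feval; the second argument is a DMatrix.
    def mae(y_pred, dmatrix):
        return "mae", float(np.abs(dmatrix.get_label() - y_pred).mean())

    model.fit(X_train, y_train, eval_metric=mae,
              eval_set=[(X_test, y_test)], early_stopping_rounds=5)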
@@ -1117,8 +1184,43 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                                             colsample_bytree,
                                             base_score, seed, missing)

-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a (str, value) pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. The metric is always minimized.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        eval_results = {}
         self.classes_ = list(np.unique(y))
         self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
@@ -1129,6 +1231,22 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options = self.get_xgb_params()

+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                xgb_options.update({"eval_metric": eval_metric})
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            nevals = len(evals)
+            eval_names = ["validation_{}".format(i) for i in range(nevals)]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
         self._le = LabelEncoder().fit(y)
         training_labels = self._le.transform(y)
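A sketch of the classifier path combining sample_weight with an eval_set
(per the TODO above, the eval_set itself is unweighted in this version):

    weights = np.ones(y_train.shape[0])
    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train, sample_weight=weights,
            eval_set=[(X_test, y_test)], eval_metric="auc",
            early_stopping_rounds=10)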
@@ -1139,7 +1257,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         train_dmatrix = DMatrix(X, label=training_labels,
                                 missing=self.missing)

-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration

         return self
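Backwards-compatibility sketch: without an eval_set the new keyword
arguments are inert and fit() behaves as before:

    clf = xgb.XGBClassifier()
    clf.fit(X_train, y_train)    # no evals, no early stopping
    preds = clf.predict(X_test)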