From 0f5f9c03850073ce756f01cd67b0b86aa0934ac7 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Wed, 20 May 2015 14:17:03 -0500
Subject: [PATCH] ENH: Allow early stopping in sklearn API.

---
 wrapper/xgboost.py | 118 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 112 insertions(+), 6 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 96f6c2573..35c24a1f2 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -772,7 +772,6 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     -------
     booster : a trained booster model
     """
-    evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
@@ -1074,6 +1073,8 @@ class XGBModel(XGBModelBase):
         params = super(XGBModel, self).get_params(deep=deep)
         if params['missing'] is np.nan:
             params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
         return params
@@ -1086,10 +1087,62 @@ class XGBModel(XGBModelBase):
         xgb_params.pop('nthread', None)
         return xgb_params

-    def fit(self, data, y):
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable=missing-docstring,invalid-name
-        train_dmatrix = DMatrix(data, label=y, missing=self.missing)
-        self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early stopping
+        eval_metric : str, optional
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true) where y_true will be a DMatrix object
+            such that you may need to call the get_label method. This objective
+            is always assumed to be minimized, so use -feval when appropriate.
+        """
+        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals,
+                             ["validation_{}".format(i) for i in range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        if eval_metric is not None:
+            params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, trainDmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            eval_results = {k: np.array(v) for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
         return self

     def predict(self, data):
@@ -1117,8 +1170,39 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                                             colsample_bytree, base_score,
                                             seed, missing)

-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early stopping
+        eval_metric : str, optional
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true) where y_true will be a DMatrix object
+            such that you may need to call the get_label method. This objective
+            is always assumed to be minimized, so use -feval when appropriate.
+        """
+        eval_results = {}
         self.classes_ = list(np.unique(y))
         self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
@@ -1129,6 +1213,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options = self.get_xgb_params()

+        if eval_metric is not None:
+            xgb_options.update({"eval_metric": eval_metric})
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            nevals = len(evals)
+            eval_names = ["validation_{}".format(i) for i in range(nevals)]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
         self._le = LabelEncoder().fit(y)
         training_labels = self._le.transform(y)
@@ -1139,7 +1235,17 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         train_dmatrix = DMatrix(X, label=training_labels,
                                 missing=self.missing)

-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
         return self
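Usage sketch (illustration only, not part of the patch): the new keyword arguments are meant to be used roughly as below, assuming pre-split numpy arrays X_train, y_train, X_valid, y_valid (hypothetical names) and that the patched wrapper is importable as xgboost.

    import xgboost as xgb

    clf = xgb.XGBClassifier(n_estimators=500)
    clf.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],   # validation data monitored for early stopping
            eval_metric="logloss",           # built-in metric forwarded to the Booster params
            early_stopping_rounds=10)        # stop if the metric fails to improve for 10 rounds

    # Populated by fit() whenever eval_set is given
    print(clf.best_iteration_, clf.best_score_)
    print(clf.eval_results_)                 # per-iteration metric values as numpy arrays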