ENH: Allow early stopping in sklearn API.
commit 0f5f9c0385
parent 167544d792
@@ -772,7 +772,6 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
     -------
-    -------
     booster : a trained booster model
     """
     evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
 
@@ -1074,6 +1073,8 @@ class XGBModel(XGBModelBase):
         params = super(XGBModel, self).get_params(deep=deep)
         if params['missing'] is np.nan:
             params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
         return params
 
     def get_xgb_params(self):
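For context, a minimal standalone sketch of the guard added in this hunk: an eval_metric entry that is present but falsy (i.e. None) is deleted so it is never forwarded to the Booster, while a missing key is left alone. The dict literal below is illustrative, not part of the diff.

    params = {'max_depth': 3, 'eval_metric': None}   # illustrative values only
    if not params.get('eval_metric', True):          # missing key -> True -> kept
        del params['eval_metric']                    # explicit None -> dropped
    assert 'eval_metric' not in params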
@@ -1086,10 +1087,61 @@ class XGBModel(XGBModelBase):
         xgb_params.pop('nthread', None)
         return xgb_params
 
-    def fit(self, data, y):
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable=missing-docstring,invalid-name
-        train_dmatrix = DMatrix(data, label=y, missing=self.missing)
-        self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early stopping
+        eval_metric : str, optional
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            the last is used. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional attributes: best_score_ and best_iteration_.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true), where y_true is a DMatrix object,
+            so you may need to call its get_label method. This metric is
+            always assumed to be minimized, so use -feval when appropriate.
+        """
+        train_dmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            evals = list(zip(evals, ["validation_{}".format(i)
+                                     for i in range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        if eval_metric is not None:
+            params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, train_dmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval)
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
         return self
 
     def predict(self, data):
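For reference, a hypothetical end-to-end call of the fit() signature added above, via XGBRegressor (which inherits fit from XGBModel). The synthetic data, metric, and round counts are invented for illustration, and the snippet assumes the sklearn wrappers are importable from the top-level xgboost package.

    import numpy as np
    from xgboost import XGBRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    y = X[:, 0] * 2.0 + rng.rand(200) * 0.1

    model = XGBRegressor(n_estimators=500)
    model.fit(X[:150], y[:150],
              eval_set=[(X[150:], y[150:])],   # becomes "validation_0"
              eval_metric='rmse',
              early_stopping_rounds=5)

    # Populated by the fit() code above when eval_set is given:
    print(model.best_iteration_, model.best_score_)
    print(list(model.eval_results_.keys()))    # ['validation_0']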
@@ -1117,8 +1170,39 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                                             colsample_bytree,
                                             base_score, seed, missing)
 
-    def fit(self, X, y, sample_weight=None):
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, feval=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early stopping
+        eval_metric : str, optional
+            Built-in evaluation metric to use.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set. If there's more than one,
+            the last is used. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional attributes: best_score_ and best_iteration_.
+        feval : function, optional
+            Custom evaluation metric to use. The call signature is
+            feval(y_predicted, y_true), where y_true is a DMatrix object,
+            so you may need to call its get_label method. This metric is
+            always assumed to be minimized, so use -feval when appropriate.
+        """
+        eval_results = {}
         self.classes_ = list(np.unique(y))
         self.n_classes_ = len(self.classes_)
         if self.n_classes_ > 2:
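The docstring above only states the call signature, so here is a sketch of a custom metric that satisfies it. The (name, value) return pair follows xgboost's usual feval convention; the metric itself and its name are invented for this example.

    import numpy as np

    def error_count(y_predicted, y_true):
        labels = y_true.get_label()                   # y_true arrives as a DMatrix
        wrong = np.sum((y_predicted > 0.5) != labels) # count misclassified rows
        return 'error_count', float(wrong)            # lower is better, as assumed

    # would be passed as: clf.fit(X, y, eval_set=[(X_val, y_val)],
    #                             feval=error_count, early_stopping_rounds=10)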
@@ -1129,6 +1213,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options = self.get_xgb_params()
 
+        if eval_metric is not None:
+            xgb_options.update({"eval_metric": eval_metric})
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = list(DMatrix(x[0], label=x[1]) for x in eval_set)
+            nevals = len(evals)
+            eval_names = ["validation_{}".format(i) for i in range(nevals)]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
         self._le = LabelEncoder().fit(y)
         training_labels = self._le.transform(y)
 
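A tiny standalone illustration of the naming scheme in this hunk: each (X, y) pair in eval_set becomes an evaluation set named validation_<index>. DMatrix construction is stubbed with plain tuples so the snippet runs without xgboost.

    eval_set = [('X_val_a', 'y_val_a'), ('X_val_b', 'y_val_b')]  # stand-ins
    evals = list(eval_set)                          # would be DMatrix(x, label=y)
    eval_names = ["validation_{}".format(i) for i in range(len(evals))]
    print(list(zip(evals, eval_names)))
    # [(('X_val_a', 'y_val_a'), 'validation_0'),
    #  (('X_val_b', 'y_val_b'), 'validation_1')]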
@@ -1139,7 +1235,17 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         train_dmatrix = DMatrix(X, label=training_labels,
                                 missing=self.missing)
 
-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results_ = eval_results
+            self.best_score_ = self._Booster.best_score
+            self.best_iteration_ = self._Booster.best_iteration
 
         return self
 
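Finally, a hypothetical smoke test of the classifier path above; the dataset and hyperparameters are illustrative only, and the top-level XGBClassifier import is assumed.

    import numpy as np
    from xgboost import XGBClassifier

    rng = np.random.RandomState(42)
    X = rng.rand(120, 4)
    y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

    clf = XGBClassifier(n_estimators=300)
    clf.fit(X[:100], y[:100],
            eval_set=[(X[100:], y[100:])],
            eval_metric='logloss',
            early_stopping_rounds=10)

    history = clf.eval_results_['validation_0']  # one value per boosting round
    print(len(history), clf.best_iteration_, clf.best_score_)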