Save and load model in sklearn API (#3192)

* Add (load|save)_model to XGBModel

* Add docstring

* Fix docstring

* Fix mixed use of space and tab

* Add a test

* Fix Flake8 style errors
Author:    Mike Liu
Date:      2018-06-30 12:21:49 -07:00
Committer: Philip Hyunsu Cho
Parent:    24fde92660
Commit:    594bcea83e

2 changed files with 67 additions and 1 deletion

python-package/xgboost/sklearn.py

@@ -176,7 +176,7 @@ class XGBModel(XGBModelBase):
         booster : a xgboost booster of underlying model
         """
         if self._Booster is None:
-            raise XGBoostError('need to call fit beforehand')
+            raise XGBoostError('need to call fit or load_model beforehand')
         return self._Booster

     def get_params(self, deep=False):
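
The sharpened message covers both ways the underlying booster can come into existence. A minimal sketch of the behaviour, assuming a fresh estimator on which neither fit nor load_model has been called:

    from xgboost import XGBClassifier
    from xgboost.core import XGBoostError

    clf = XGBClassifier()
    try:
        clf.get_booster()  # no booster yet: neither fit() nor load_model() ran
    except XGBoostError as err:
        print(err)  # need to call fit or load_model beforehand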
@@ -214,6 +214,28 @@ class XGBModel(XGBModelBase):
         xgb_params.pop('nthread', None)
         return xgb_params

+    def save_model(self, fname):
+        """
+        Save the model to a file.
+        Parameters
+        ----------
+        fname : string
+            Output file name
+        """
+        self.get_booster().save_model(fname)
+
+    def load_model(self, fname):
+        """
+        Load the model from a file.
+        Parameters
+        ----------
+        fname : string or a memory buffer
+            Input file name or memory buffer (see also save_raw)
+        """
+        if self._Booster is None:
+            self._Booster = Booster({'nthread': self.n_jobs})
+        self._Booster.load_model(fname)
+
     def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
             sample_weight_eval_set=None):
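
Together, the two methods give the sklearn wrapper a file-based persistence roundtrip: save from a fitted estimator, load into a fresh one. A usage sketch with synthetic data (the file name is illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 5)
    y = (X[:, 0] > 0.5).astype(int)

    clf = xgb.XGBClassifier().fit(X, y)
    clf.save_model('model.bin')  # delegates to Booster.save_model

    model = xgb.XGBModel()
    model.load_model('model.bin')  # creates a Booster on demand, then loads
    preds = model.predict(X)  # booster outputs, not class labels

Because load_model only constructs a Booster when none exists yet, it also works on an estimator that was never fitted in the current process.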

tests/python/test_with_sklearn.py

@@ -1,11 +1,24 @@
 import numpy as np
 import xgboost as xgb
 import testing as tm
+import tempfile
+import os
+import shutil
 from nose.tools import raises

 rng = np.random.RandomState(1994)


+class TemporaryDirectory(object):
+    """Context manager for tempfile.mkdtemp()"""
+    def __enter__(self):
+        self.name = tempfile.mkdtemp()
+        return self.name
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        shutil.rmtree(self.name)
+
+
 def test_binary_classification():
     tm._skip_if_no_sklearn()
     from sklearn.datasets import load_digits
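
The handwritten context manager stands in for tempfile.TemporaryDirectory, which was only added in Python 3.2 and so is unavailable on Python 2, which this nose-based suite still supports. It behaves like the standard one:

    import os

    with TemporaryDirectory() as tempdir:
        path = os.path.join(tempdir, 'scratch.txt')
        open(path, 'w').close()  # create a file inside the fresh directory
    # __exit__ runs shutil.rmtree, removing the directory and its contents
    # even if the body raised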
@@ -458,3 +471,34 @@ def test_validation_weights_xgbclassifier():
     # check that the logloss in the test set is actually different when using weights
     # than when not using them
     assert all((logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]))
+
+
+def test_save_load_model():
+    tm._skip_if_no_sklearn()
+    from sklearn.datasets import load_digits
+    try:
+        from sklearn.model_selection import KFold
+    except ImportError:
+        from sklearn.cross_validation import KFold
+
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    try:
+        kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
+    except TypeError:  # sklearn.model_selection.KFold uses n_splits, not n_folds
+        kf = KFold(
+            n_splits=2, shuffle=True, random_state=rng
+        ).split(np.arange(y.shape[0]))
+    with TemporaryDirectory() as tempdir:
+        model_path = os.path.join(tempdir, 'digits.model')
+        for train_index, test_index in kf:
+            xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
+            xgb_model.save_model(model_path)
+            xgb_model = xgb.XGBModel()
+            xgb_model.load_model(model_path)
+            preds = xgb_model.predict(X[test_index])
+            labels = y[test_index]
+            err = sum(1 for i in range(len(preds))
+                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
+            assert err < 0.1
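
The 0.5 threshold is needed because the model is reloaded as a plain XGBModel rather than an XGBClassifier: predict then returns the booster's raw outputs (probabilities under the binary:logistic objective the classifier was trained with), not class labels. A small sketch of the conversion the test performs inline:

    import numpy as np

    probs = np.array([0.91, 0.12, 0.55])  # what XGBModel.predict yields here
    labels = (probs > 0.5).astype(int)    # hard labels: array([1, 0, 1])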