stratified cv for python wrapper

finalize docstring
2016-02-14 11:00:41 +01:00
parent 9b2b81e6a4
commit 4b3a053913
3 changed files with 70 additions and 10 deletions
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -33,8 +33,11 @@ try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
+    from sklearn.cross_validation import KFold, StratifiedKFold
    SKLEARN_INSTALLED = True

+    XGBKFold = KFold
+    XGBStratifiedKFold = StratifiedKFold
    XGBModelBase = BaseEstimator
    XGBRegressorBase = RegressorMixin
    XGBClassifierBase = ClassifierMixin
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -8,6 +8,7 @@ import sys
 import re
 import numpy as np
 from .core import Booster, STRING_TYPES
+from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold)

 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
@@ -261,15 +262,26 @@ class CVPack(object):
        return self.bst.eval_set(self.watchlist, iteration, feval)


-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
+def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
    """
    Make an n-fold list of CVPack from random indices.
    """
    evals = list(evals)
    np.random.seed(seed)
-    randidx = np.random.permutation(dall.num_row())
-    kstep = len(randidx) / nfold
-    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+
+    if stratified is False and folds is None:
+        randidx = np.random.permutation(dall.num_row())
+        kstep = len(randidx) / nfold
+        idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+    elif folds is not None:
+        idset = [x[1] for x in folds]
+        nfold = len(idset)
+    else:
+        idset = [x[1] for x in XGBStratifiedKFold(dall.get_label(),
+                                                  n_folds=nfold,
+                                                  shuffle=True,
+                                                  random_state=seed)]
+
    ret = []
    for k in range(nfold):
        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
@@ -345,8 +357,8 @@ def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True, trial=0):
    return results


-def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
-       obj=None, feval=None, maximize=False, early_stopping_rounds=None,
+def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
+       metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
       fpreproc=None, as_pandas=True, show_progress=None, show_stdv=True, seed=0):
    # pylint: disable = invalid-name
    """Cross-validation with given paramaters.
@@ -361,6 +373,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
+    stratified : bool
+        Perform stratified sampling.
+    folds : KFold or StratifiedKFold
+        Sklearn KFolds or StratifiedKFolds.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
    obj : function
@@ -381,9 +397,9 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
        If False or pandas is not installed, return np.ndarray
    show_progress : bool, int, or None, default None
        Whether to display the progress. If None, progress will be displayed
-        when np.ndarray is returned. If True, progress will be displayed at 
-        boosting stage. If an integer is given, progress will be displayed 
-        at every given `show_progress` boosting stage. 
+        when np.ndarray is returned. If True, progress will be displayed at
+        boosting stage. If an integer is given, progress will be displayed
+        at every given `show_progress` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
@@ -394,6 +410,9 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
    -------
    evaluation history : list(string)
    """
+    if stratified == True and not SKLEARN_INSTALLED:
+            raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
+
    if isinstance(metrics, str):
        metrics = [metrics]

@@ -436,7 +455,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),

    best_score_i = 0
    results = []
-    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
    for i in range(num_boost_round):
        for fold in cvfolds:
            fold.update(i, obj)
@@ -466,3 +485,4 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
        results = np.array(results)

    return results
+
--- a/tests/python/test_cv.py
+++ b/tests/python/test_cv.py
@@ -0,0 +1,37 @@
+import xgboost as xgb
+import numpy as np
+from sklearn.datasets import load_digits
+from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split
+from sklearn.metrics import mean_squared_error
+import unittest
+
+rng = np.random.RandomState(1994)
+
+
+class TestCrossValidation(unittest.TestCase):
+    def test_cv(self):
+        digits = load_digits(3)
+        X = digits['data']
+        y = digits['target']
+        dm = xgb.DMatrix(X, label=y)
+        
+        params = {
+            'max_depth': 2,
+            'eta': 1,
+            'silent': 1,
+            'objective':
+            'multi:softprob',
+            'num_class': 3
+        }
+
+        seed = 2016
+        nfolds = 5
+        skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)
+
+        import pandas as pd
+        cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
+        cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
+        cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
+        assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
+        assert cv2.iloc[-1,0] == cv3.iloc[-1,0]
+