stratified cv for python wrapper

finalize docstring
This commit is contained in:
Faron 2016-02-14 11:00:41 +01:00
parent 9b2b81e6a4
commit 4b3a053913
3 changed files with 70 additions and 10 deletions

View File

@ -33,8 +33,11 @@ try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold, StratifiedKFold
SKLEARN_INSTALLED = True
XGBKFold = KFold
XGBStratifiedKFold = StratifiedKFold
XGBModelBase = BaseEstimator
XGBRegressorBase = RegressorMixin
XGBClassifierBase = ClassifierMixin

View File

@ -8,6 +8,7 @@ import sys
import re
import numpy as np
from .core import Booster, STRING_TYPES
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold)
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
maximize=False, early_stopping_rounds=None, evals_result=None,
@ -261,15 +262,26 @@ class CVPack(object):
return self.bst.eval_set(self.watchlist, iteration, feval)
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
"""
Make an n-fold list of CVPack from random indices.
"""
evals = list(evals)
np.random.seed(seed)
randidx = np.random.permutation(dall.num_row())
kstep = len(randidx) / nfold
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
if stratified is False and folds is None:
randidx = np.random.permutation(dall.num_row())
kstep = len(randidx) / nfold
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
elif folds is not None:
idset = [x[1] for x in folds]
nfold = len(idset)
else:
idset = [x[1] for x in XGBStratifiedKFold(dall.get_label(),
n_folds=nfold,
shuffle=True,
random_state=seed)]
ret = []
for k in range(nfold):
dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
@ -345,8 +357,8 @@ def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True, trial=0):
return results
def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
obj=None, feval=None, maximize=False, early_stopping_rounds=None,
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
fpreproc=None, as_pandas=True, show_progress=None, show_stdv=True, seed=0):
# pylint: disable = invalid-name
"""Cross-validation with given parameters.
@ -361,6 +373,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
Number of boosting iterations.
nfold : int
Number of folds in CV.
stratified : bool
Perform stratified sampling.
folds : KFold or StratifiedKFold
Sklearn KFolds or StratifiedKFolds.
metrics : string or list of strings
Evaluation metrics to be watched in CV.
obj : function
@ -381,9 +397,9 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
If False or pandas is not installed, return np.ndarray
show_progress : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed
when np.ndarray is returned. If True, progress will be displayed at
boosting stage. If an integer is given, progress will be displayed
at every given `show_progress` boosting stage.
when np.ndarray is returned. If True, progress will be displayed at
boosting stage. If an integer is given, progress will be displayed
at every given `show_progress` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
Results are not affected, and always contains std.
@ -394,6 +410,9 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
-------
evaluation history : list(string)
"""
if stratified == True and not SKLEARN_INSTALLED:
raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
if isinstance(metrics, str):
metrics = [metrics]
@ -436,7 +455,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
best_score_i = 0
results = []
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
for i in range(num_boost_round):
for fold in cvfolds:
fold.update(i, obj)
@ -466,3 +485,4 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
results = np.array(results)
return results

37
tests/python/test_cv.py Normal file
View File

@ -0,0 +1,37 @@
import xgboost as xgb
import numpy as np
from sklearn.datasets import load_digits
from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error
import unittest
rng = np.random.RandomState(1994)
class TestCrossValidation(unittest.TestCase):
    """Exercise xgb.cv with default, user-supplied and stratified folds."""

    def test_cv(self):
        """Run cv three ways on a 3-class digits subset and compare the results.

        cv1 uses the default random folds, cv2 receives a ready-made
        StratifiedKFold through ``folds``, and cv3 asks cv to stratify itself
        via ``stratified=True`` with the same fold count and seed as cv2.
        """
        # Pass n_class by keyword so the call does not depend on the
        # positional order of load_digits' parameters.
        digits = load_digits(n_class=3)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)

        params = {
            'max_depth': 2,
            'eta': 1,
            'silent': 1,
            'objective': 'multi:softprob',
            'num_class': 3,
        }

        seed = 2016
        nfolds = 5
        skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)

        cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
        cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
        cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)

        # All three runs use the same number of boosting rounds, so the
        # evaluation histories must have the same number of rows.
        self.assertEqual(cv1.shape[0], cv2.shape[0])
        self.assertEqual(cv2.shape[0], cv3.shape[0])

        # cv2 and cv3 are built from the same stratified split parameters, so
        # their final-round first metric column is expected to match exactly.
        # NOTE: .iloc assumes cv returned a pandas DataFrame (as_pandas default).
        self.assertEqual(cv2.iloc[-1, 0], cv3.iloc[-1, 0])