Add stratified CV support to the Python wrapper; finalize the docstring.
This commit is contained in:
parent
9b2b81e6a4
commit
4b3a053913
@ -33,8 +33,11 @@ try:
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import RegressorMixin, ClassifierMixin
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.cross_validation import KFold, StratifiedKFold
|
||||
SKLEARN_INSTALLED = True
|
||||
|
||||
XGBKFold = KFold
|
||||
XGBStratifiedKFold = StratifiedKFold
|
||||
XGBModelBase = BaseEstimator
|
||||
XGBRegressorBase = RegressorMixin
|
||||
XGBClassifierBase = ClassifierMixin
|
||||
|
||||
@ -8,6 +8,7 @@ import sys
|
||||
import re
|
||||
import numpy as np
|
||||
from .core import Booster, STRING_TYPES
|
||||
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold, XGBKFold)
|
||||
|
||||
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
|
||||
maximize=False, early_stopping_rounds=None, evals_result=None,
|
||||
@ -261,15 +262,26 @@ class CVPack(object):
|
||||
return self.bst.eval_set(self.watchlist, iteration, feval)
|
||||
|
||||
|
||||
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
|
||||
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, folds=None):
|
||||
"""
|
||||
Make an n-fold list of CVPack from random indices.
|
||||
"""
|
||||
evals = list(evals)
|
||||
np.random.seed(seed)
|
||||
|
||||
if stratified is False and folds is None:
|
||||
randidx = np.random.permutation(dall.num_row())
|
||||
kstep = len(randidx) / nfold
|
||||
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
|
||||
elif folds is not None:
|
||||
idset = [x[1] for x in folds]
|
||||
nfold = len(idset)
|
||||
else:
|
||||
idset = [x[1] for x in XGBStratifiedKFold(dall.get_label(),
|
||||
n_folds=nfold,
|
||||
shuffle=True,
|
||||
random_state=seed)]
|
||||
|
||||
ret = []
|
||||
for k in range(nfold):
|
||||
dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
|
||||
@ -345,8 +357,8 @@ def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True, trial=0):
|
||||
return results
|
||||
|
||||
|
||||
def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
|
||||
obj=None, feval=None, maximize=False, early_stopping_rounds=None,
|
||||
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
|
||||
metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None,
|
||||
fpreproc=None, as_pandas=True, show_progress=None, show_stdv=True, seed=0):
|
||||
# pylint: disable = invalid-name
|
||||
"""Cross-validation with given paramaters.
|
||||
@ -361,6 +373,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
|
||||
Number of boosting iterations.
|
||||
nfold : int
|
||||
Number of folds in CV.
|
||||
stratified : bool
|
||||
Perform stratified sampling.
|
||||
folds : KFold or StratifiedKFold
|
||||
Sklearn KFolds or StratifiedKFolds.
|
||||
metrics : string or list of strings
|
||||
Evaluation metrics to be watched in CV.
|
||||
obj : function
|
||||
@ -394,6 +410,9 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
|
||||
-------
|
||||
evaluation history : list(string)
|
||||
"""
|
||||
if stratified == True and not SKLEARN_INSTALLED:
|
||||
raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
|
||||
|
||||
if isinstance(metrics, str):
|
||||
metrics = [metrics]
|
||||
|
||||
@ -436,7 +455,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
|
||||
|
||||
best_score_i = 0
|
||||
results = []
|
||||
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
|
||||
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds)
|
||||
for i in range(num_boost_round):
|
||||
for fold in cvfolds:
|
||||
fold.update(i, obj)
|
||||
@ -466,3 +485,4 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
|
||||
results = np.array(results)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
37
tests/python/test_cv.py
Normal file
37
tests/python/test_cv.py
Normal file
@ -0,0 +1,37 @@
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.cross_validation import KFold, StratifiedKFold, train_test_split
|
||||
from sklearn.metrics import mean_squared_error
|
||||
import unittest
|
||||
|
||||
rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
class TestCrossValidation(unittest.TestCase):
    """Tests for xgb.cv's stratified sampling and user-supplied folds."""

    def test_cv(self):
        """Explicitly passed stratified folds must match stratified=True.

        Runs xgb.cv three ways on the 3-class digits data: default
        (unstratified) folds, an explicit StratifiedKFold object via
        ``folds=``, and ``stratified=True`` — then checks the latter two
        agree exactly.
        """
        digits = load_digits(3)
        X = digits['data']
        y = digits['target']
        dm = xgb.DMatrix(X, label=y)

        params = {
            'max_depth': 2,
            'eta': 1,
            'silent': 1,
            'objective': 'multi:softprob',
            'num_class': 3,
        }

        seed = 2016
        nfolds = 5
        # Same seed as the cv() calls so stratified=True reproduces these folds.
        skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)

        cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
        cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
        cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds,
                     stratified=True, seed=seed)

        # All three runs cover the same number of boosting rounds.
        assert cv1.shape[0] == cv2.shape[0] == cv3.shape[0]
        # folds=skf and stratified=True must produce identical final metrics.
        assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user