add cv for python
This commit is contained in:
parent
586d6ae740
commit
da9c856701
@ -12,7 +12,7 @@ This is a list of short codes introducing different functionalities of xgboost a
|
|||||||
* Customize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
|
* Customize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
|
||||||
* Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
|
* Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
|
||||||
* Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
|
* Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
|
||||||
* Cross validation(to come)
|
* Cross validation [python](guide-python/cross_validation.py)
|
||||||
|
|
||||||
Basic Examples by Tasks
|
Basic Examples by Tasks
|
||||||
====
|
====
|
||||||
|
|||||||
@ -4,3 +4,4 @@ XGBoost Python Feature Walkthrough
|
|||||||
* [Customize loss function, and evaluation metric](custom_objective.py)
|
* [Customize loss function, and evaluation metric](custom_objective.py)
|
||||||
* [Boosting from existing prediction](boost_from_prediction.py)
|
* [Boosting from existing prediction](boost_from_prediction.py)
|
||||||
* [Predicting using first n trees](predict_first_ntree.py)
|
* [Predicting using first n trees](predict_first_ntree.py)
|
||||||
|
* [Cross validation](cross_validation.py)
|
||||||
|
|||||||
63
demo/guide-python/cross_validation.py
Executable file
63
demo/guide-python/cross_validation.py
Executable file
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
sys.path.append('../../wrapper')
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
|
### load data and do training
|
||||||
|
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
|
||||||
|
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
|
||||||
|
num_round = 2
|
||||||
|
|
||||||
|
print ('running cross validation')
|
||||||
|
# do cross validation, this will print result out as
|
||||||
|
# [iteration] metric_name:mean_value+std_value
|
||||||
|
# std_value is standard deviation of the metric
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||||
|
metrics={'error'}, seed = 0)
|
||||||
|
|
||||||
|
print ('running cross validation, disable standard deviation display')
|
||||||
|
# do cross validation, this will print result out as
|
||||||
|
# [iteration] metric_name:mean_value (std_value is omitted because show_stdv = False)
|
||||||
|
# std_value is standard deviation of the metric
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||||
|
metrics={'error'}, seed = 0, show_stdv = False)
|
||||||
|
|
||||||
|
print ('running cross validation, with preprocessing function')
|
||||||
|
# define the preprocessing function
|
||||||
|
# used to return the preprocessed training, test data, and parameter
|
||||||
|
# we can use this to do weight rescale, etc.
|
||||||
|
# as an example, we try to set scale_pos_weight
|
||||||
|
def fpreproc(dtrain, dtest, param):
|
||||||
|
label = dtrain.get_label()
|
||||||
|
ratio = float(np.sum(label == 0)) / np.sum(label==1)
|
||||||
|
param['scale_pos_weight'] = ratio
|
||||||
|
return (dtrain, dtest, param)
|
||||||
|
|
||||||
|
# do cross validation, for each fold
|
||||||
|
# the dtrain, dtest, param will be passed into fpreproc
|
||||||
|
# then the return value of fpreproc will be used to generate
|
||||||
|
# results of that fold
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold=5,
|
||||||
|
metrics={'auc'}, seed = 0, fpreproc = fpreproc)
|
||||||
|
|
||||||
|
###
|
||||||
|
# you can also do cross validation with customized loss function
|
||||||
|
# See custom_objective.py
|
||||||
|
##
|
||||||
|
print ('running cross validation, with cutomsized loss function')
|
||||||
|
def logregobj(preds, dtrain):
|
||||||
|
labels = dtrain.get_label()
|
||||||
|
preds = 1.0 / (1.0 + np.exp(-preds))
|
||||||
|
grad = preds - labels
|
||||||
|
hess = preds * (1.0-preds)
|
||||||
|
return grad, hess
|
||||||
|
def evalerror(preds, dtrain):
|
||||||
|
labels = dtrain.get_label()
|
||||||
|
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||||
|
|
||||||
|
param = {'max_depth':2, 'eta':1, 'silent':1}
|
||||||
|
# train with customized objective
|
||||||
|
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
|
||||||
|
obj = logregobj, feval=evalerror)
|
||||||
|
|
||||||
@ -80,6 +80,9 @@ class EvalSet{
|
|||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
inline size_t Size(void) const {
|
||||||
|
return evals_.size();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<const IEvaluator*> evals_;
|
std::vector<const IEvaluator*> evals_;
|
||||||
|
|||||||
@ -244,8 +244,10 @@ class BoostLearner {
|
|||||||
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||||
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||||
}
|
}
|
||||||
|
if (evaluator_.Size() == 0) {
|
||||||
evaluator_.AddEval(obj_->DefaultEvalMetric());
|
evaluator_.AddEval(obj_->DefaultEvalMetric());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief get un-transformed prediction
|
* \brief get un-transformed prediction
|
||||||
* \param data training data matrix
|
* \param data training data matrix
|
||||||
|
|||||||
@ -448,11 +448,13 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
|
|||||||
# run preprocessing on the data set if needed
|
# run preprocessing on the data set if needed
|
||||||
if fpreproc is not None:
|
if fpreproc is not None:
|
||||||
dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
|
dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
|
||||||
|
else:
|
||||||
|
tparam = param
|
||||||
plst = tparam.items() + [('eval_metric', itm) for itm in evals]
|
plst = tparam.items() + [('eval_metric', itm) for itm in evals]
|
||||||
ret.append(CVPack(dtrain, dtest, plst))
|
ret.append(CVPack(dtrain, dtest, plst))
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def aggcv(rlist):
|
def aggcv(rlist, show_stdv=True):
|
||||||
"""
|
"""
|
||||||
aggregate cross validation results
|
aggregate cross validation results
|
||||||
"""
|
"""
|
||||||
@ -468,11 +470,14 @@ def aggcv(rlist):
|
|||||||
cvmap[k].append(float(v))
|
cvmap[k].append(float(v))
|
||||||
for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
|
for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
|
||||||
v = np.array(v)
|
v = np.array(v)
|
||||||
ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
|
if show_stdv:
|
||||||
|
ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
|
||||||
|
else:
|
||||||
|
ret += '\tcv-%s:%f' % (k, np.mean(v))
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
|
def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
|
||||||
obj = None, feval = None, fpreproc = None):
|
obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
|
||||||
""" cross validation with given parameters
|
""" cross validation with given parameters
|
||||||
Args:
|
Args:
|
||||||
params: dict
|
params: dict
|
||||||
@ -485,14 +490,21 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
|
|||||||
folds to do cv
|
folds to do cv
|
||||||
evals: list or
|
evals: list or
|
||||||
list of items to be evaluated
|
list of items to be evaluated
|
||||||
obj:
|
obj: custom objective function
|
||||||
feval:
|
feval: custom evaluation function
|
||||||
fpreproc: preprocessing function that takes dtrain, dtest,
|
fpreproc: preprocessing function that takes dtrain, dtest,
|
||||||
param and return transformed version of dtrain, dtest, param
|
param and return transformed version of dtrain, dtest, param
|
||||||
|
show_stdv: whether to display standard deviation
|
||||||
|
seed: seed used to generate the folds
|
||||||
|
|
||||||
|
Returns: list(string) of evaluation history
|
||||||
"""
|
"""
|
||||||
cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc)
|
results = []
|
||||||
|
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
|
||||||
for i in range(num_boost_round):
|
for i in range(num_boost_round):
|
||||||
for f in cvfolds:
|
for f in cvfolds:
|
||||||
f.update(i, obj)
|
f.update(i, obj)
|
||||||
res = aggcv([f.eval(i, feval) for f in cvfolds])
|
res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
|
||||||
sys.stderr.write(res+'\n')
|
sys.stderr.write(res+'\n')
|
||||||
|
results.append(res)
|
||||||
|
return results
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user