diff --git a/demo/README.md b/demo/README.md
index 5b199633b..e83bde6b4 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -12,7 +12,7 @@ This is a list of short codes introducing different functionalities of xgboost a
 * Cutomize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
 * Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
 * Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
-* Cross validation(to come)
+* Cross validation [python](guide-python/cross_validation.py)
 
 Basic Examples by Tasks
 ====
diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md
index b2cad6b54..3014ee23e 100644
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@@ -4,3 +4,4 @@ XGBoost Python Feature Walkthrough
 * [Cutomize loss function, and evaluation metric](custom_objective.py)
 * [Boosting from existing prediction](boost_from_prediction.py)
 * [Predicting using first n trees](predict_first_ntree.py)
+* [Cross validation](cross_validation.py)
diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py
new file mode 100755
index 000000000..a50586c58
--- /dev/null
+++ b/demo/guide-python/cross_validation.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+num_round = 2
+
+print ('running cross validation')
+# do cross validation, this will print result out as
+# [iteration] metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0)
+
+print ('running cross validation, disable standard deviation display')
+# do cross validation, this will print result out as
+# [iteration] metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0, show_stdv = False)
+
+print ('running cross validation, with preprocessing function')
+# define the preprocessing function
+# used to return the preprocessed training, test data, and parameter
+# we can use this to do weight rescale, etc.
+# as an example, we try to set scale_pos_weight
+def fpreproc(dtrain, dtest, param):
+    label = dtrain.get_label()
+    ratio = float(np.sum(label == 0)) / np.sum(label==1)
+    param['scale_pos_weight'] = ratio
+    return (dtrain, dtest, param)
+
+# do cross validation, for each fold
+# the dtrain, dtest, param will be passed into fpreproc
+# then the return value of fpreproc will be used to generate
+# results of that fold
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'auc'}, seed = 0, fpreproc = fpreproc)
+
+###
+# you can also do cross validation with customized loss function
+# See custom_objective.py
+##
+print ('running cross validation, with customized loss function')
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+param = {'max_depth':2, 'eta':1, 'silent':1}
+# train with customized objective
+xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
+       obj = logregobj, feval=evalerror)
+
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
index f34d832c8..33370e706 100644
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -80,6 +80,9 @@ class EvalSet{
     }
     return result;
   }
+  inline size_t Size(void) const {
+    return evals_.size();
+  }
 
  private:
  std::vector<const IEvaluator*> evals_;
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 05519de8b..88026975d 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -244,7 +244,9 @@ class BoostLearner {
       obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
       gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
     }
-    evaluator_.AddEval(obj_->DefaultEvalMetric());
+    if (evaluator_.Size() == 0) {
+      evaluator_.AddEval(obj_->DefaultEvalMetric());
+    }
   }
   /*!
   * \brief get un-transformed prediction
diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 5b4eee6b8..34d61bede 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -448,11 +448,13 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
         # run preprocessing on the data set if needed
         if fpreproc is not None:
             dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
+        else:
+            tparam = param
         plst = tparam.items() + [('eval_metric', itm) for itm in evals]
         ret.append(CVPack(dtrain, dtest, plst))
     return ret
 
-def aggcv(rlist):
+def aggcv(rlist, show_stdv=True):
     """
     aggregate cross validation results
     """
@@ -468,11 +470,14 @@ def aggcv(rlist):
             cvmap[k].append(float(v))
     for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
         v = np.array(v)
-        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
+        if show_stdv:
+            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
+        else:
+            ret += '\tcv-%s:%f' % (k, np.mean(v))
     return ret
-def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
-       obj = None, feval = None, fpreproc = None):
+def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
+       obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
     """
     cross validation with given paramaters
     Args:
         params: dict
@@ -485,14 +490,21 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
            folds to do cv
        evals: list or
            list of items to be evaluated
-       obj:
-       feval:
+       obj: custom objective function
+       feval: custom evaluation function
        fpreproc: preprocessing function that takes dtrain, dtest, param
            and return transformed version of dtrain, dtest, param
+       show_stdv: whether to display standard deviation
+       seed: seed used to generate the folds
+
+    Returns: list(string) of evaluation history
     """
-    cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc)
+    results = []
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
     for i in range(num_boost_round):
         for f in cvfolds:
             f.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds])
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
         sys.stderr.write(res+'\n')
+        results.append(res)
+    return results
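
As an illustrative aside (not part of the patch): a minimal sketch of how the evaluation history now returned by cv() might be consumed, for example to pick the boosting round with the lowest mean cv error. It assumes each history entry keeps the "[round]\tcv-metric:mean+std" layout built by aggcv above; the data path and parameters simply mirror demo/guide-python/cross_validation.py.

#!/usr/bin/python
# illustrative sketch only -- not part of the diff above
import re
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}

# run 10 rounds of 5-fold cv; the patched cv() now returns the per-round
# summary strings instead of only writing them to stderr
history = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
                 metrics={'error'}, seed=0)

# pull the mean cv error out of each summary line (format assumed from aggcv:
# "[round]\tcv-error:mean+std") and report the best round
means = [float(re.search(r'cv-error:([0-9.]+)', line).group(1))
         for line in history]
best = int(np.argmin(means))
print('lowest mean cv-error %f at round %d' % (means[best], best))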