add cv for python

2014-09-03 22:43:55 -07:00 · 2014-09-03 22:43:55 -07:00 · da9c856701
commit da9c856701
parent 586d6ae740
6 changed files with 91 additions and 10 deletions
--- a/demo/README.md
+++ b/demo/README.md
@ -12,7 +12,7 @@ This is a list of short codes introducing different functionalities of xgboost a
 * Customize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
 * Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
 * Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
-* Cross validation(to come)
+* Cross validation [python](guide-python/cross_validation.py)
 Basic Examples by Tasks
 ====
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@ -4,3 +4,4 @@ XGBoost Python Feature Walkthrough
 * [Customize loss function, and evaluation metric](custom_objective.py)
 * [Boosting from existing prediction](boost_from_prediction.py)
 * [Predicting using first n trees](predict_first_ntree.py)
 * [Cross validation](cross_validation.py)
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@ -0,0 +1,63 @@
 #!/usr/bin/python
 import sys
 import numpy as np
 sys.path.append('../../wrapper')
 import xgboost as xgb
 ### load data and set parameters for cross validation
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
 param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
 num_round = 2
 print ('running cross validation')
 # do cross validation, this will print result out as
 # [iteration]  metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0)
 print ('running cross validation, disable standard deviation display')
 # do cross validation with show_stdv = False, this will print result out as
 # [iteration]  metric_name:mean_value
 # the standard deviation of the metric is not displayed
 xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed = 0, show_stdv = False)
 print ('running cross validation, with preprocessing function')
 # preprocessing hook for cross validation:
 # it receives the training data, test data and parameters and must
 # return the (possibly transformed) triple; handy for weight rescale, etc.
 # as an example, we set scale_pos_weight from the training labels
 def fpreproc(dtrain, dtest, param):
    """Set param['scale_pos_weight'] to (#labels == 0) / (#labels == 1) of dtrain.

    Returns the tuple (dtrain, dtest, param); dtrain and dtest are passed
    through untouched, param is updated in place.
    """
    train_labels = dtrain.get_label()
    num_negative = np.sum(train_labels == 0)
    num_positive = np.sum(train_labels == 1)
    param['scale_pos_weight'] = float(num_negative) / num_positive
    return (dtrain, dtest, param)
 # do cross validation, for each fold
 # the dtrain, dtest, param will be passed into fpreproc
 # then the return value of fpreproc will be used to generate
 # results of that fold
 xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'auc'}, seed = 0, fpreproc = fpreproc)
 ###
 # you can also do cross validation with customized loss function
 # See custom_objective.py
 ##
 print ('running cross validation, with cutomsized loss function')
 def logregobj(preds, dtrain):
    """Custom objective: gradient and hessian of the logistic loss.

    preds is pushed through the sigmoid before being compared with the
    labels of dtrain. Returns the pair (grad, hess).
    """
    targets = dtrain.get_label()
    # sigmoid of the incoming predictions
    prob = 1.0 / (1.0 + np.exp(-preds))
    gradient = prob - targets
    hessian = prob * (1.0 - prob)
    return gradient, hessian
 def evalerror(preds, dtrain):
    """Custom metric: return ('error', fraction of rows misclassified).

    A row counts as predicted positive when its prediction is > 0.0.
    """
    labels = dtrain.get_label()
    predicted_positive = preds > 0.0
    num_wrong = sum(labels != predicted_positive)
    return 'error', float(num_wrong) / len(labels)
 # no 'objective' entry here: the objective is supplied through obj below
 param = {'max_depth':2, 'eta':1, 'silent':1} 
 # cross validate with customized objective and evaluation function
 xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
       obj = logregobj, feval=evalerror)
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@ -80,6 +80,9 @@ class EvalSet{
    }
    return result;
  }
  /*! \return number of evaluators currently stored in evals_ */
  inline size_t Size(void) const {
    return evals_.size();
  }
 private:
  std::vector<const IEvaluator*> evals_;
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@ -244,8 +244,10 @@ class BoostLearner {
      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
    }
    if (evaluator_.Size() == 0) {
      evaluator_.AddEval(obj_->DefaultEvalMetric());
    }
  }
  /*! 
   * \brief get un-transformed prediction
   * \param data training data matrix
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@ -448,11 +448,13 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = tparam.items() + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret
-def aggcv(rlist):
+def aggcv(rlist, show_stdv=True):
    """
    aggregate cross validation results
    """
@ -468,11 +470,14 @@ def aggcv(rlist):
            cvmap[k].append(float(v))
    for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
        v = np.array(v)
-        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
+        if show_stdv:
            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
        else:
            ret += '\tcv-%s:%f' % (k, np.mean(v))
    return ret
-def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
+def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
-        obj = None, feval = None, fpreproc = None):
+        obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
    """ cross validation  with given paramaters
        Args:
            params: dict
@ -485,14 +490,21 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
                   folds to do cv
            evals: list or
                   list of items to be evaluated
-            obj:
+            obj: custom objective function
-            feval:
+            feval: custom evaluation function
            fpreproc: preprocessing function that takes dtrain, dtest,
                      param and return transformed version of dtrain, dtest, param
            show_stdv: whether display standard deviation
            seed: seed used to generate the folds
        Returns: list(string) of evaluation history
    """
-    cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc)
+    results = []
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    for i in range(num_boost_round):
        for f in cvfolds:
            f.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds])
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
        sys.stderr.write(res+'\n')
        results.append(res)
    return results