add cv for python

2014-09-03 22:43:55 -07:00 · 2014-09-03 22:43:55 -07:00 · da9c856701
commit da9c856701
parent 586d6ae740
6 changed files with 91 additions and 10 deletions
--- a/demo/README.md
+++ b/demo/README.md
@ -12,7 +12,7 @@ This is a list of short codes introducing different functionalities of xgboost a
 * Cutomize loss function, and evaluation metric. [python](guide-python/custom_objective.py)
 * Boosting from existing prediction. [python](guide-python/boost_from_prediction.py)
 * Predicting using first n trees. [python](guide-python/predict_first_ntree.py)
-* Cross validation(to come)
+* Cross validation [python](guide-python/cross_validation.py)

 Basic Examples by Tasks
 ====
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@ -4,3 +4,4 @@ XGBoost Python Feature Walkthrough
 * [Cutomize loss function, and evaluation metric](custom_objective.py)
 * [Boosting from existing prediction](boost_from_prediction.py)
 * [Predicting using first n trees](predict_first_ntree.py)
+* [Cross validation](cross_validation.py)
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@ -0,0 +1,63 @@
+#!/usr/bin/python
+import sys
+import numpy as np
+sys.path.append('../../wrapper')
+import xgboost as xgb
+
+### load data and do training
+dtrain = xgb.DMatrix('../data/agaricus.txt.train')
+param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+num_round = 2
+
+print ('running cross validation')
+# do cross validation, this will print result out as
+# [iteration]  metric_name:mean_value+std_value
+# std_value is standard deviation of the metric
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0)
+
+print ('running cross validation, disable standard deviation display')
+# do cross validation, this will print result out as
+# [iteration]  metric_name:mean_value
+# the standard deviation is omitted because show_stdv is False
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'error'}, seed = 0, show_stdv = False)
+
+print ('running cross validation, with preprocessing function')
+# define the preprocessing function
+# used to return the preprocessed training, test data, and parameter
+# we can use this to do weight rescale, etc.
+# as an example, we try to set scale_pos_weight
+def fpreproc(dtrain, dtest, param):
+    label = dtrain.get_label()
+    ratio = float(np.sum(label == 0)) / np.sum(label==1)
+    param['scale_pos_weight'] = ratio
+    return (dtrain, dtest, param)
+
+# do cross validation, for each fold
+# the dtrain, dtest, param will be passed into fpreproc
+# then the return value of fpreproc will be used to generate
+# results of that fold
+xgb.cv(param, dtrain, num_round, nfold=5,
+       metrics={'auc'}, seed = 0, fpreproc = fpreproc)
+
+###
+# you can also do cross validation with customized loss function
+# See custom_objective.py
+##
+print ('running cross validation, with cutomsized loss function')
+def logregobj(preds, dtrain):
+    labels = dtrain.get_label()
+    preds = 1.0 / (1.0 + np.exp(-preds))
+    grad = preds - labels
+    hess = preds * (1.0-preds)
+    return grad, hess
+def evalerror(preds, dtrain):
+    labels = dtrain.get_label()
+    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
+
+param = {'max_depth':2, 'eta':1, 'silent':1} 
+# cross validate with customized objective and evaluation function
+xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
+       obj = logregobj, feval=evalerror)
+
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@ -80,6 +80,9 @@ class EvalSet{
    }
    return result;
  }
+  inline size_t Size(void) const {
+    return evals_.size();
+  }

 private:
  std::vector<const IEvaluator*> evals_;
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@ -244,8 +244,10 @@ class BoostLearner {
      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
    }
+    if (evaluator_.Size() == 0) {
      evaluator_.AddEval(obj_->DefaultEvalMetric());
    }
+  }
  /*! 
   * \brief get un-transformed prediction
   * \param data training data matrix
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@ -448,11 +448,13 @@ def mknfold(dall, nfold, param, seed, evals=[], fpreproc = None):
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
+        else:
+            tparam = param
        plst = tparam.items() + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret

-def aggcv(rlist):
+def aggcv(rlist, show_stdv=True):
    """
    aggregate cross validation results
    """
@ -468,11 +470,14 @@ def aggcv(rlist):
            cvmap[k].append(float(v))
    for k, v in sorted(cvmap.items(), key = lambda x:x[0]):
        v = np.array(v)
-        ret += '\t%s:%f+%f' % (k, np.mean(v), np.std(v))
+        if show_stdv:
+            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
+        else:
+            ret += '\tcv-%s:%f' % (k, np.mean(v))
    return ret

-def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
-        obj = None, feval = None, fpreproc = None):
+def cv(params, dtrain, num_boost_round = 10, nfold=3, metrics=[], \
+        obj = None, feval = None, fpreproc = None, show_stdv = True, seed = 0):
    """ cross validation  with given paramaters
        Args:
            params: dict
@ -485,14 +490,21 @@ def cv(params, dtrain, num_boost_round = 10, nfold=3, eval_metric = [], \
                   folds to do cv
            evals: list or
                   list of items to be evaluated
-            obj:
-            feval:
+            obj: custom objective function
+            feval: custom evaluation function
            fpreproc: preprocessing function that takes dtrain, dtest,
                      param and return transformed version of dtrain, dtest, param
+            show_stdv: whether to display the standard deviation
+            seed: seed used to generate the folds
+
+        Returns: list(string) of evaluation history
    """
-    cvfolds = mknfold(dtrain, nfold, params, 0, eval_metric, fpreproc)
+    results = []
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
    for i in range(num_boost_round):
        for f in cvfolds:
            f.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds])
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
        sys.stderr.write(res+'\n')
+        results.append(res)
+    return results