#!/usr/bin/python
"""Cross-validation demo with a per-fold preprocessing hook.

Script path in the repository: demo/kaggle-higgs/higgs-cv.py

Loads ./data/training.csv, wraps it in a weighted DMatrix and runs 5-fold
xgb.cv.  The ``fpreproc`` callback is handed to xgb.cv so that every fold
gets its own ``scale_pos_weight`` and rescaled instance weights.
"""
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb


def _label_to_int(field):
    """Return 1 when the raw label field equals 's', otherwise 0.

    np.loadtxt may hand converters either bytes or str, depending on the
    NumPy version and the ``encoding`` argument.  Comparing against bytes
    only would silently map every label to 0 whenever str is passed, so
    both types are accepted here.
    """
    if isinstance(field, bytes):
        field = field.decode('utf-8')
    return int(field == 's')


# load the data and set up training
# column 0 is not used, columns 1..30 are the features,
# column 31 is the instance weight and column 32 is the label
train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1,
                   converters={32: _label_to_int})
label = train[:, 32]
data = train[:, 1:31]
weight = train[:, 31]
dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight)
param = {'max_depth': 6, 'eta': 0.1, 'silent': 1,
         'objective': 'binary:logitraw', 'nthread': 4}
num_round = 120

print('running cross validation, with preprocessing function')


# define the preprocessing function
# it returns the preprocessed training data, test data and parameters
# we can use this to do weight rescaling, etc.
# as an example, we set scale_pos_weight per fold
def fpreproc(dtrain, dtest, param):
    """Preprocess one cross-validation fold.

    Sets ``scale_pos_weight`` to the ratio (count of label == 0) /
    (count of label == 1) in the fold's training part, and rescales the
    instance weights of the training and test parts so that each of them
    sums to the combined weight total of the two parts.

    Args:
        dtrain: training DMatrix of the fold.
        dtest: test DMatrix of the fold.
        param: dict of booster parameters for the fold.

    Returns:
        Tuple ``(dtrain, dtest, param)`` where ``param`` is a copy of the
        input dict with ``scale_pos_weight`` added.
    """
    # work on a copy so the dict owned by the caller is left untouched
    param = dict(param)
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    # np.sum reduces in native code; the builtin sum() would walk the
    # arrays element by element, and each total is needed twice
    train_total = np.sum(wtrain)
    test_total = np.sum(wtest)
    sum_weight = train_total + test_total
    wtrain *= sum_weight / train_total
    wtest *= sum_weight / test_total
    dtrain.set_weight(wtrain)
    dtest.set_weight(wtest)
    return (dtrain, dtest, param)


# do cross validation; for each fold
# the dtrain, dtest, param will be passed into fpreproc
# and the return value of fpreproc is used to generate
# the results of that fold
# metrics is a list (not a set) so the evaluation order is the same on every run
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics=['ams@0.15', 'auc'], seed=0, fpreproc=fpreproc)