From a7f3d7edd70d7f760056a742af4b79ba1398b446 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 15 May 2014 20:05:22 -0700
Subject: [PATCH] ok

---
Note: the script contents below were corrected during review; the index
hashes still refer to the blobs of the original commit.

 demo/kaggle-higgs/README.md      |  1 +
 demo/kaggle-higgs/higgs-numpy.py | 54 ++++++++++++++++++++++++++++++++
 demo/kaggle-higgs/higgs-pred.py  | 44 ++++++++++++++++++++++++++
 demo/kaggle-higgs/run.sh         |  4 +++
 4 files changed, 103 insertions(+)
 create mode 100644 demo/kaggle-higgs/README.md
 create mode 100755 demo/kaggle-higgs/higgs-numpy.py
 create mode 100755 demo/kaggle-higgs/higgs-pred.py
 create mode 100755 demo/kaggle-higgs/run.sh

diff --git a/demo/kaggle-higgs/README.md b/demo/kaggle-higgs/README.md
new file mode 100644
index 000000000..189358f40
--- /dev/null
+++ b/demo/kaggle-higgs/README.md
@@ -0,0 +1 @@
+This folder gives an example of how to use XGBoost for the Kaggle Higgs competition.
diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py
new file mode 100755
index 000000000..b98df88c6
--- /dev/null
+++ b/demo/kaggle-higgs/higgs-numpy.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+# this is the example script to use xgboost to train
+import sys
+import numpy as np
+# add path of xgboost python module
+sys.path.append('../../python/')
+import xgboost as xgb
+
+# size of the test set, used below to rescale the training weights
+test_size = 550000
+
+# path to where the data lies
+dpath = 'data'
+
+# load in training data, directly use numpy;
+# the label in column 32 is converted to 1 for 's' and 0 otherwise
+dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
+label = dtrain[:,32]
+data = dtrain[:,1:31]
+# rescale weight to make it same as test set
+weight = dtrain[:,31] * float(test_size) / len(label)
+
+# total weight of the positive and of the negative examples
+sum_wpos = weight[label == 1.0].sum()
+sum_wneg = weight[label == 0.0].sum()
+
+# print weight statistics
+print( 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ) )
+
+# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value,
+# and attach the rescaled instance weights computed above
+xtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
+
+# setup parameters for xgboost
+param = {}
+# use logistic regression loss
+param['loss_type'] = 3
+# scale weight of positive examples by the negative/positive weight ratio
+param['scale_pos_weight'] = sum_wneg/sum_wpos
+param['bst:eta'] = 0.1
+param['bst:max_depth'] = 6
+param['eval_metric'] = 'ams@0.15'
+param['silent'] = 1
+param['eval_train'] = 1
+param['nthread'] = 16
+
+# boost 120 trees
+num_round = 120
+print( 'loading data end, start to boost trees' )
+bst = xgb.train( param, xtrain, num_round )
+# save out model
+bst.save_model('higgs.model')
+
+print( 'finish training' )
diff --git a/demo/kaggle-higgs/higgs-pred.py b/demo/kaggle-higgs/higgs-pred.py
new file mode 100755
index 000000000..ebae9188c
--- /dev/null
+++ b/demo/kaggle-higgs/higgs-pred.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+# this is the example script to use xgboost to predict
+import sys
+import numpy as np
+# add path of xgboost python module
+sys.path.append('../../python/')
+import xgboost as xgb
+
+# path to where the data lies
+dpath = 'data'
+
+modelfile = 'higgs.model'
+outfile = 'higgs.pred.csv'
+# make top 15% as positive
+threshold_ratio = 0.15
+
+# load in test data, directly use numpy
+dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
+data = dtest[:,1:31]
+# event ids are in the first column; the features start at column 1
+idx = dtest[:,0]
+
+xtest = xgb.DMatrix( data, missing = -999.0 )
+bst = xgb.Booster()
+bst.load_model( modelfile )
+
+# predict on the DMatrix built above, not on the raw numpy array
+ypred = bst.predict( xtest )
+res = [ ( int(i), p ) for i, p in zip( idx, ypred ) ]
+
+# rank the events by descending prediction, the highest prediction gets rank 1
+rorder = {}
+for k, v in sorted( res, key = lambda x:-x[1] ):
+    rorder[ k ] = len(rorder) + 1
+
+# write out predictions, events ranked within the top threshold_ratio are labelled 's'
+ntop = int( threshold_ratio * len(rorder) )
+with open(outfile, 'w') as fo:
+    fo.write('EventId,RankOrder,Class\n')
+    for k, v in res:
+        lb = 's' if rorder[k] <= ntop else 'b'
+        fo.write('%s,%d,%s\n' % ( k, rorder[k], lb ) )
+
+print( 'finished writing into prediction file' )
diff --git a/demo/kaggle-higgs/run.sh b/demo/kaggle-higgs/run.sh
new file mode 100755
index 000000000..e6b5d91fa
--- /dev/null
+++ b/demo/kaggle-higgs/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+./higgs-numpy.py
+./higgs-pred.py