Compatibility with both Python 2(.7) and 3
This commit is contained in:
@@ -24,7 +24,7 @@ def loadfmap( fname ):
|
||||
return fmap, nmap
|
||||
|
||||
def write_nmap( fo, nmap ):
|
||||
for i in xrange( len(nmap) ):
|
||||
for i in range( len(nmap) ):
|
||||
fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
|
||||
|
||||
# start here
|
||||
@@ -41,7 +41,7 @@ for l in open( 'agaricus-lepiota.data' ):
|
||||
else:
|
||||
assert arr[0] == 'e'
|
||||
fo.write('0')
|
||||
for i in xrange( 1,len(arr) ):
|
||||
for i in range( 1,len(arr) ):
|
||||
fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
|
||||
fo.write('\n')
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import sys
|
||||
import random
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage:<filename> <k> [nfold = 5]'
|
||||
print ('Usage:<filename> <k> [nfold = 5]')
|
||||
exit(0)
|
||||
|
||||
random.seed( 10 )
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
#!/usr/bin/python
|
||||
# this is the example script to use xgboost to train
|
||||
import inspect
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
# add path of xgboost python module
|
||||
sys.path.append('../../python/')
|
||||
code_path = os.path.join(
|
||||
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../python")
|
||||
|
||||
sys.path.append(code_path)
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
test_size = 550000
|
||||
@@ -12,19 +18,19 @@ test_size = 550000
|
||||
dpath = 'data'
|
||||
|
||||
# load in training data, directly use numpy
|
||||
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
|
||||
print 'finish loading from csv '
|
||||
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
|
||||
print ('finish loading from csv ')
|
||||
|
||||
label = dtrain[:,32]
|
||||
data = dtrain[:,1:31]
|
||||
# rescale weight to make it same as test set
|
||||
weight = dtrain[:,31] * float(test_size) / len(label)
|
||||
|
||||
sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
|
||||
sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
|
||||
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
|
||||
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
|
||||
|
||||
# print weight statistics
|
||||
print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
|
||||
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
|
||||
|
||||
# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
|
||||
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
|
||||
@@ -43,14 +49,14 @@ param['silent'] = 1
|
||||
param['nthread'] = 16
|
||||
|
||||
# you can directly throw param in, though we want to watch multiple metrics here
|
||||
plst = param.items()+[('eval_metric', 'ams@0.15')]
|
||||
plst = list(param.items())+[('eval_metric', 'ams@0.15')]
|
||||
|
||||
watchlist = [ (xgmat,'train') ]
|
||||
# boost 120 trees
|
||||
num_round = 120
|
||||
print 'loading data end, start to boost trees'
|
||||
print ('loading data end, start to boost trees')
|
||||
bst = xgb.train( plst, xgmat, num_round, watchlist );
|
||||
# save out model
|
||||
bst.save_model('higgs.model')
|
||||
|
||||
print 'finish training'
|
||||
print ('finish training')
|
||||
|
||||
@@ -19,13 +19,13 @@ dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
|
||||
data = dtest[:,1:31]
|
||||
idx = dtest[:,0]
|
||||
|
||||
print 'finish loading from csv '
|
||||
print ('finish loading from csv ')
|
||||
xgmat = xgb.DMatrix( data, missing = -999.0 )
|
||||
bst = xgb.Booster({'nthread':16})
|
||||
bst.load_model( modelfile )
|
||||
ypred = bst.predict( xgmat )
|
||||
|
||||
res = [ ( int(idx[i]), ypred[i] ) for i in xrange(len(ypred)) ]
|
||||
res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
|
||||
|
||||
rorder = {}
|
||||
for k, v in sorted( res, key = lambda x:-x[1] ):
|
||||
@@ -47,7 +47,7 @@ for k, v in res:
|
||||
ntot += 1
|
||||
fo.close()
|
||||
|
||||
print 'finished writing into prediction file'
|
||||
print ('finished writing into prediction file')
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
python higgs-numpy.py
|
||||
python higgs-pred.py
|
||||
python -u higgs-numpy.py
|
||||
ret=$?
|
||||
if [[ $ret != 0 ]]; then
|
||||
echo "ERROR in higgs-numpy.py"
|
||||
exit $ret
|
||||
fi
|
||||
python -u higgs-pred.py
|
||||
ret=$?
|
||||
if [[ $ret != 0 ]]; then
|
||||
echo "ERROR in higgs-pred.py"
|
||||
exit $ret
|
||||
fi
|
||||
|
||||
@@ -14,18 +14,18 @@ dpath = 'data'
|
||||
|
||||
# load in training data, directly use numpy
|
||||
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
|
||||
print 'finish loading from csv '
|
||||
print ('finish loading from csv ')
|
||||
|
||||
label = dtrain[:,32]
|
||||
data = dtrain[:,1:31]
|
||||
# rescale weight to make it same as test set
|
||||
weight = dtrain[:,31] * float(test_size) / len(label)
|
||||
|
||||
sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
|
||||
sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
|
||||
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
|
||||
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
|
||||
|
||||
# print weight statistics
|
||||
print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
|
||||
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
|
||||
|
||||
# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
|
||||
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
|
||||
@@ -47,20 +47,20 @@ plst = param.items()+[('eval_metric', 'ams@0.15')]
|
||||
watchlist = [ (xgmat,'train') ]
|
||||
# boost 10 trees
|
||||
num_round = 10
|
||||
print 'loading data end, start to boost trees'
|
||||
print "training GBM from sklearn"
|
||||
print ('loading data end, start to boost trees')
|
||||
print ("training GBM from sklearn")
|
||||
tmp = time.time()
|
||||
gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
|
||||
gbm.fit(data, label)
|
||||
print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp)
|
||||
print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
|
||||
#raw_input()
|
||||
print "training xgboost"
|
||||
print ("training xgboost")
|
||||
threads = [1, 2, 4, 16]
|
||||
for i in threads:
|
||||
param['nthread'] = i
|
||||
tmp = time.time()
|
||||
plst = param.items()+[('eval_metric', 'ams@0.15')]
|
||||
bst = xgb.train( plst, xgmat, num_round, watchlist );
|
||||
print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp))
|
||||
print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
|
||||
|
||||
print 'finish training'
|
||||
print ('finish training')
|
||||
|
||||
@@ -37,6 +37,6 @@ bst = xgb.train(param, xg_train, num_round, watchlist );
|
||||
# get prediction
|
||||
pred = bst.predict( xg_test );
|
||||
|
||||
print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) )
|
||||
print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))
|
||||
|
||||
|
||||
|
||||
@@ -2,18 +2,18 @@ import sys
|
||||
|
||||
def save_data(group_data,output_feature,output_group):
|
||||
if len(group_data) == 0:
|
||||
return
|
||||
return
|
||||
|
||||
output_group.write(str(len(group_data))+"\n")
|
||||
for data in group_data:
|
||||
# only include nonzero features
|
||||
feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
|
||||
output_feature.write(data[0] + " " + " ".join(feats) + "\n")
|
||||
output_feature.write(data[0] + " " + " ".join(feats) + "\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 4:
|
||||
print "Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]"
|
||||
sys.exit(0)
|
||||
print ("Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]")
|
||||
sys.exit(0)
|
||||
|
||||
fi = open(sys.argv[1])
|
||||
output_feature = open(sys.argv[2],"w")
|
||||
@@ -22,16 +22,16 @@ if __name__ == "__main__":
|
||||
group_data = []
|
||||
group = ""
|
||||
for line in fi:
|
||||
if not line:
|
||||
break
|
||||
if "#" in line:
|
||||
line = line[:line.index("#")]
|
||||
if not line:
|
||||
break
|
||||
if "#" in line:
|
||||
line = line[:line.index("#")]
|
||||
splits = line.strip().split(" ")
|
||||
if splits[1] != group:
|
||||
save_data(group_data,output_feature,output_group)
|
||||
group_data = []
|
||||
group = splits[1]
|
||||
group_data.append(splits)
|
||||
if splits[1] != group:
|
||||
save_data(group_data,output_feature,output_group)
|
||||
group_data = []
|
||||
group = splits[1]
|
||||
group_data.append(splits)
|
||||
|
||||
save_data(group_data,output_feature,output_group)
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ fmap = {}
|
||||
for l in open( 'machine.data' ):
|
||||
arr = l.split(',')
|
||||
fo.write(arr[8])
|
||||
for i in xrange( 0,6 ):
|
||||
for i in range( 0,6 ):
|
||||
fo.write( ' %d:%s' %(i,arr[i+2]) )
|
||||
|
||||
if arr[0] not in fmap:
|
||||
@@ -24,9 +24,9 @@ fo = open('featmap.txt', 'w')
|
||||
# list from machine.names
|
||||
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
|
||||
|
||||
for i in xrange(0,6):
|
||||
for i in range(0,6):
|
||||
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
|
||||
|
||||
for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
|
||||
for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
|
||||
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
|
||||
fo.close()
|
||||
|
||||
@@ -3,7 +3,7 @@ import sys
|
||||
import random
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage:<filename> <k> [nfold = 5]'
|
||||
print ('Usage:<filename> <k> [nfold = 5]')
|
||||
exit(0)
|
||||
|
||||
random.seed( 10 )
|
||||
|
||||
Reference in New Issue
Block a user