diff --git a/demo/binary_classification/mapfeat.py b/demo/binary_classification/mapfeat.py
index 74ca22d32..186af29e6 100755
--- a/demo/binary_classification/mapfeat.py
+++ b/demo/binary_classification/mapfeat.py
@@ -24,7 +24,7 @@ def loadfmap( fname ):
     return fmap, nmap
 
 def write_nmap( fo, nmap ):
-    for i in xrange( len(nmap) ):
+    for i in range( len(nmap) ):
         fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
 
 # start here
@@ -41,7 +41,7 @@ for l in open( 'agaricus-lepiota.data' ):
     else:
         assert arr[0] == 'e'
         fo.write('0')
-    for i in xrange( 1,len(arr) ):
+    for i in range( 1,len(arr) ):
         fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
     fo.write('\n')
 
diff --git a/demo/binary_classification/mknfold.py b/demo/binary_classification/mknfold.py
index 0f42c301d..a941f8609 100755
--- a/demo/binary_classification/mknfold.py
+++ b/demo/binary_classification/mknfold.py
@@ -3,7 +3,7 @@ import sys
 import random
 
 if len(sys.argv) < 2:
-    print 'Usage: [nfold = 5]'
+    print ('Usage: [nfold = 5]')
     exit(0)
 
 random.seed( 10 )
diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py
index 2bf4a82a5..1cb7ec00c 100755
--- a/demo/kaggle-higgs/higgs-numpy.py
+++ b/demo/kaggle-higgs/higgs-numpy.py
@@ -1,9 +1,15 @@
 #!/usr/bin/python
 # this is the example script to use xgboost to train
+import inspect
+import os
 import sys
 import numpy as np
 # add path of xgboost python module
-sys.path.append('../../python/')
+code_path = os.path.join(
+    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../python")
+
+sys.path.append(code_path)
+
 import xgboost as xgb
 
 test_size = 550000
@@ -12,19 +18,19 @@ test_size = 550000
 dpath = 'data'
 
 # load in training data, directly use numpy
-dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
-print 'finish loading from csv '
+dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
+print ('finish loading from csv ')
 
 label = dtrain[:,32]
 data = dtrain[:,1:31]
 # rescale weight to make it same as test set
 weight = dtrain[:,31] * float(test_size) / len(label)
 
-sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
-sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
 
 # print weight statistics
-print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
 
 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
 xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
@@ -43,14 +49,14 @@ param['silent'] = 1
 param['nthread'] = 16
 
 # you can directly throw param in, though we want to watch multiple metrics here
-plst = param.items()+[('eval_metric', 'ams@0.15')]
+plst = list(param.items())+[('eval_metric', 'ams@0.15')]
 watchlist = [ (xgmat,'train') ]
 # boost 120 tres
 num_round = 120
-print 'loading data end, start to boost trees'
+print ('loading data end, start to boost trees')
 bst = xgb.train( plst, xgmat, num_round, watchlist );
 
 # save out model
 bst.save_model('higgs.model')
 
-print 'finish training'
+print ('finish training')
diff --git a/demo/kaggle-higgs/higgs-pred.py b/demo/kaggle-higgs/higgs-pred.py
index 3fad9c217..a38f6d82a 100755
--- a/demo/kaggle-higgs/higgs-pred.py
+++ b/demo/kaggle-higgs/higgs-pred.py
@@ -19,13 +19,13 @@
 dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
 data = dtest[:,1:31]
 idx = dtest[:,0]
-print 'finish loading from csv '
+print ('finish loading from csv ')
 
 xgmat = xgb.DMatrix( data, missing = -999.0 )
 bst = xgb.Booster({'nthread':16})
 bst.load_model( modelfile )
 ypred = bst.predict( xgmat )
 
-res = [ ( int(idx[i]), ypred[i] ) for i in xrange(len(ypred)) ]
+res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
 rorder = {}
 for k, v in sorted( res, key = lambda x:-x[1] ):
@@ -47,7 +47,7 @@ for k, v in res:
     ntot += 1
 fo.close()
 
-print 'finished writing into prediction file'
+print ('finished writing into prediction file')
diff --git a/demo/kaggle-higgs/run.sh b/demo/kaggle-higgs/run.sh
index c69426c25..23cde394b 100755
--- a/demo/kaggle-higgs/run.sh
+++ b/demo/kaggle-higgs/run.sh
@@ -1,4 +1,14 @@
 #!/bin/bash
 
-python higgs-numpy.py
-python higgs-pred.py
\ No newline at end of file
+python -u higgs-numpy.py
+ret=$?
+if [[ $ret != 0 ]]; then
+    echo "ERROR in higgs-numpy.py"
+    exit $ret
+fi
+python -u higgs-pred.py
+ret=$?
+if [[ $ret != 0 ]]; then
+    echo "ERROR in higgs-pred.py"
+    exit $ret
+fi
diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py
index 8bef29ff2..86d5e3a3c 100755
--- a/demo/kaggle-higgs/speedtest.py
+++ b/demo/kaggle-higgs/speedtest.py
@@ -14,18 +14,18 @@ dpath = 'data'
 
 # load in training data, directly use numpy
 dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
-print 'finish loading from csv '
+print ('finish loading from csv ')
 
 label = dtrain[:,32]
 data = dtrain[:,1:31]
 # rescale weight to make it same as test set
 weight = dtrain[:,31] * float(test_size) / len(label)
 
-sum_wpos = sum( weight[i] for i in xrange(len(label)) if label[i] == 1.0 )
-sum_wneg = sum( weight[i] for i in xrange(len(label)) if label[i] == 0.0 )
+sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
+sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
 
 # print weight statistics
-print 'weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )
+print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
 
 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
 xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
@@ -47,20 +47,20 @@ plst = param.items()+[('eval_metric', 'ams@0.15')]
 watchlist = [ (xgmat,'train') ]
 # boost 10 tres
 num_round = 10
-print 'loading data end, start to boost trees'
-print "training GBM from sklearn"
+print ('loading data end, start to boost trees')
+print ("training GBM from sklearn")
 tmp = time.time()
 gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
 gbm.fit(data, label)
-print "sklearn.GBM costs: %s seconds" % str(time.time() - tmp)
+print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
 #raw_input()
-print "training xgboost"
+print ("training xgboost")
 threads = [1, 2, 4, 16]
 for i in threads:
     param['nthread'] = i
     tmp = time.time()
     plst = param.items()+[('eval_metric', 'ams@0.15')]
     bst = xgb.train( plst, xgmat, num_round, watchlist );
-    print "XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp))
+    print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
 
-print 'finish training'
+print ('finish training')
diff --git a/demo/multiclass_classification/train.py b/demo/multiclass_classification/train.py
index 38d818890..df5e112aa 100755
--- a/demo/multiclass_classification/train.py
+++ b/demo/multiclass_classification/train.py
@@ -37,6 +37,6 @@ bst = xgb.train(param, xg_train, num_round, watchlist );
 # get prediction
 pred = bst.predict( xg_test );
 
-print 'predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in xrange(len(test_Y))) / float(len(test_Y)) )
+print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))
 
diff --git a/demo/rank/trans_data.py b/demo/rank/trans_data.py
index 3c9865106..7282848c4 100644
--- a/demo/rank/trans_data.py
+++ b/demo/rank/trans_data.py
@@ -2,18 +2,18 @@ import sys
 
 def save_data(group_data,output_feature,output_group):
     if len(group_data) == 0:
-	return
+        return
     output_group.write(str(len(group_data))+"\n")
     for data in group_data:
         # only include nonzero features
         feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
-	output_feature.write(data[0] + " " + " ".join(feats) + "\n")
+        output_feature.write(data[0] + " " + " ".join(feats) + "\n")
 
 if __name__ == "__main__":
     if len(sys.argv) != 4:
-	print "Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]"
-	sys.exit(0)
+        print ("Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]")
+        sys.exit(0)
 
     fi = open(sys.argv[1])
     output_feature = open(sys.argv[2],"w")
     output_group = open(sys.argv[3],"w")
@@ -22,16 +22,16 @@ if __name__ == "__main__":
     group_data = []
     group = ""
     for line in fi:
-	if not line:
-	    break
-	if "#" in line:
-	    line = line[:line.index("#")]
+        if not line:
+            break
+        if "#" in line:
+            line = line[:line.index("#")]
         splits = line.strip().split(" ")
-	if splits[1] != group:
-	    save_data(group_data,output_feature,output_group)
-	    group_data = []
-	    group = splits[1]
-	group_data.append(splits)
+        if splits[1] != group:
+            save_data(group_data,output_feature,output_group)
+            group_data = []
+            group = splits[1]
+        group_data.append(splits)
 
     save_data(group_data,output_feature,output_group)
diff --git a/demo/regression/mapfeat.py b/demo/regression/mapfeat.py
index 5ee494fb1..d86dca38a 100755
--- a/demo/regression/mapfeat.py
+++ b/demo/regression/mapfeat.py
@@ -7,7 +7,7 @@ fmap = {}
 for l in open( 'machine.data' ):
     arr = l.split(',')
     fo.write(arr[8])
-    for i in xrange( 0,6 ):
+    for i in range( 0,6 ):
         fo.write( ' %d:%s' %(i,arr[i+2]) )
 
     if arr[0] not in fmap:
@@ -24,9 +24,9 @@ fo = open('featmap.txt', 'w')
 # list from machine.names
 names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
 
-for i in xrange(0,6):
+for i in range(0,6):
     fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
 
-for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
+for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
     fo.write( '%d\tvendor=%s\ti\n' % (k, v))
 fo.close()
diff --git a/demo/regression/mknfold.py b/demo/regression/mknfold.py
index 0f42c301d..a941f8609 100755
--- a/demo/regression/mknfold.py
+++ b/demo/regression/mknfold.py
@@ -3,7 +3,7 @@ import sys
 import random
 
 if len(sys.argv) < 2:
-    print 'Usage: [nfold = 5]'
+    print ('Usage: [nfold = 5]')
     exit(0)
 
 random.seed( 10 )
diff --git a/python/example/demo.py b/python/example/demo.py
index 5cf3fb5db..d6c91b5ea 100755
--- a/python/example/demo.py
+++ b/python/example/demo.py
@@ -22,7 +22,7 @@ bst = xgb.train( param, dtrain, num_round, evallist )
 # this is prediction
 preds = bst.predict( dtest )
 labels = dtest.get_label()
-print 'error=%f' % ( sum(1 for i in xrange(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds)))
+print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
 bst.save_model('0001.model')
 # dump model
 bst.dump_model('dump.raw.txt')
@@ -32,7 +32,7 @@ bst.dump_model('dump.raw.txt','featmap.txt')
 ###
 # build dmatrix in python iteratively
 #
-print 'start running example of build DMatrix in python'
+print ('start running example of build DMatrix in python')
 dtrain = xgb.DMatrix()
 labels = []
 for l in open('agaricus.txt.train'):
@@ -50,7 +50,7 @@ bst = xgb.train( param, dtrain, num_round, evallist )
 
 ###
 # build dmatrix from scipy.sparse
-print 'start running example of build DMatrix from scipy.sparse'
+print ('start running example of build DMatrix from scipy.sparse')
 labels = []
 row = []; col = []; dat = []
 i = 0
@@ -68,7 +68,7 @@ dtrain.set_label(labels)
 evallist = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )
 
-print 'start running example of build DMatrix from numpy array'
+print ('start running example of build DMatrix from numpy array')
 # NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
 npymat = csr.todense()
 dtrain = xgb.DMatrix( npymat )
@@ -79,7 +79,7 @@ bst = xgb.train( param, dtrain, num_round, evallist )
 
 ###
 # advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
 #
-print 'start running example to used cutomized objective function'
+print ('start running example to use a customized objective function')
 # note: set loss_type properly, loss_type=2 means the prediction will get logistic transformed
 # in most case, we may want to set loss_type = 0, to get untransformed score to compute gradient
diff --git a/python/xgboost.py b/python/xgboost.py
index f51ef7ea5..690d0d4ed 100644
--- a/python/xgboost.py
+++ b/python/xgboost.py
@@ -41,7 +41,7 @@ class DMatrix:
         if data == None:
             return
         if isinstance(data,str):
-            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data), 1)
+            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data.encode('utf-8')), 1)
         elif isinstance(data,scp.csr_matrix):
             self.__init_from_csr(data)
         elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
@@ -51,7 +51,7 @@ class DMatrix:
                 csr = scp.csr_matrix(data)
                 self.__init_from_csr(csr)
             except:
-                raise Exception, "can not intialize DMatrix from"+str(type(data))
+                raise Exception("cannot initialize DMatrix from "+str(type(data)))
         if label != None:
             self.set_label(label)
         if weight !=None:
@@ -76,10 +76,10 @@ class DMatrix:
         xglib.XGDMatrixFree(self.handle)
     # load data from file
    def load(self, fname, silent=True):
-        xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname), int(silent))
+        xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
     # load data from file
     def save_binary(self, fname, silent=True):
-        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname), int(silent))
+        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
     # set label of dmatrix
     def set_label(self, label):
         xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float*len(label))(*label), len(label) )
@@ -111,7 +111,7 @@ class DMatrix:
     def __getitem__(self, ridx):
         length = ctypes.c_ulong()
         row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length) );
-        return [ (int(row[i].findex),row[i].fvalue) for i in xrange(length.value) ]
+        return [ (int(row[i].findex),row[i].fvalue) for i in range(length.value) ]
 
 class Booster:
"""learner class """ @@ -124,15 +124,21 @@ class Booster: self.set_param( params ) def __del__(self): xglib.XGBoosterFree(self.handle) - def set_param(self, params,pv=None): + def set_param(self, params, pv=None): if isinstance(params,dict): - for k, v in params.iteritems(): - xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) ) + for k, v in params.items(): + xglib.XGBoosterSetParam( + self.handle, ctypes.c_char_p(k.encode('utf-8')), + ctypes.c_char_p(str(v).encode('utf-8'))) elif isinstance(params,str) and pv != None: - xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(params), ctypes.c_char_p(str(pv)) ) + xglib.XGBoosterSetParam( + self.handle, ctypes.c_char_p(params.encode('utf-8')), + ctypes.c_char_p(str(pv).encode('utf-8')) ) else: for k, v in params: - xglib.XGBoosterSetParam( self.handle, ctypes.c_char_p(k), ctypes.c_char_p(str(v)) ) + xglib.XGBoosterSetParam( + self.handle, ctypes.c_char_p(k.encode('utf-8')), + ctypes.c_char_p(str(v).encode('utf-8')) ) def update(self, dtrain): """ update """ assert isinstance(dtrain, DMatrix) @@ -150,13 +156,15 @@ class Booster: assert isinstance(dtrain, DMatrix) if booster_index != None: self.set_param('interact:booster_index', str(booster_index)) - xglib.XGBoosterUpdateInteract( self.handle, dtrain.handle, ctypes.c_char_p(str(action)) ) + xglib.XGBoosterUpdateInteract( + self.handle, dtrain.handle, ctypes.c_char_p(str(action)) ) def eval_set(self, evals, it = 0): for d in evals: assert isinstance(d[0], DMatrix) assert isinstance(d[1], str) dmats = ( ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals]) - evnames = ( ctypes.c_char_p * len(evals) )(*[ ctypes.c_char_p(d[1]) for d in evals]) + evnames = ( ctypes.c_char_p * len(evals) )( + *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals]) xglib.XGBoosterEvalOneIter( self.handle, it, dmats, evnames, len(evals) ) def eval(self, mat, name = 'eval', it = 0 ): self.eval_set( [(mat,name)], it) @@ -166,25 +174,27 @@ class Booster: return ctypes2numpy( preds, length.value ) def save_model(self, fname): """ save model to file """ - xglib.XGBoosterSaveModel( self.handle, ctypes.c_char_p(fname) ) + xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8'))) def load_model(self, fname): """load model from file""" - xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname) ) + xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) ) def dump_model(self, fname, fmap=''): """dump model into text file""" - xglib.XGBoosterDumpModel( self.handle, ctypes.c_char_p(fname), ctypes.c_char_p(fmap) ) + xglib.XGBoosterDumpModel( + self.handle, ctypes.c_char_p(fname.encode('utf-8')), + ctypes.c_char_p(fmap.encode('utf-8'))) def train(params, dtrain, num_boost_round = 10, evals = [], obj=None): """ train a booster with given paramaters """ bst = Booster(params, [dtrain] ) if obj == None: - for i in xrange(num_boost_round): + for i in range(num_boost_round): bst.update( dtrain ) if len(evals) != 0: bst.eval_set( evals, i ) else: # try customized objective function - for i in xrange(num_boost_round): + for i in range(num_boost_round): pred = bst.predict( dtrain ) grad, hess = obj( pred, dtrain ) bst.boost( dtrain, grad, hess )