update regression
This commit is contained in:
parent
0a971cb466
commit
7d97d6b1d4
@ -2,11 +2,12 @@ xgboost: eXtreme Gradient Boosting
|
|||||||
=======
|
=======
|
||||||
A General purpose gradient boosting (tree) library.
|
A General purpose gradient boosting (tree) library.
|
||||||
|
|
||||||
Creator: Tianqi Chen
|
Authors:
|
||||||
|
* Tianqi Chen, project creator
|
||||||
|
* Kailong Chen, contributes regression module
|
||||||
|
|
||||||
Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki
|
Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki
|
||||||
|
|
||||||
|
|
||||||
Features
|
Features
|
||||||
=======
|
=======
|
||||||
* Sparse feature format:
|
* Sparse feature format:
|
||||||
@ -37,4 +38,3 @@ File extension convention
|
|||||||
* .h are interface, utils and data structures, with detailed comment;
|
* .h are interface, utils and data structures, with detailed comment;
|
||||||
* .cpp are implementations that will be compiled, with less comment;
|
* .cpp are implementations that will be compiled, with less comment;
|
||||||
* .hpp are implementations that will be included by .cpp, with less comment
|
* .hpp are implementations that will be included by .cpp, with less comment
|
||||||
|
|
||||||
|
|||||||
@ -1,80 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
def loadnmap( fname ):
    """Load the feature-id to feature-name map from a text file.

    Each non-blank line must contain at least two whitespace-separated
    columns: an integer feature id followed by the feature name.  Any
    further columns are ignored.

    Args:
        fname: path of the feature map file.

    Returns:
        dict mapping int feature id to the name string.

    Raises:
        ValueError: if the first column is not an integer.
        IndexError: if a non-blank line has fewer than two columns.
    """
    nmap = {}
    # use a context manager so the file is closed even when parsing fails
    with open( fname ) as fin:
        for line in fin:
            arr = line.split()
            if not arr:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # split() already removes surrounding whitespace from each column
            nmap[ int( arr[0] ) ] = arr[1]
    return nmap
|
|
||||||
|
|
||||||
def recstats( rec, l, label ):
    """Accumulate per-node label counts for one comma-separated list of node ids.

    Args:
        rec: dict mapping int node id to a (neg_cnt, pos_cnt) tuple; updated
            in place.
        l: comma-separated string of integer node ids.
        label: instance label; 0 increments the first counter, any other
            value increments the second.

    Raises:
        ValueError: if an entry of `l` is not an integer.
    """
    for it in l.split(','):
        k = int( it )
        # Start from (0, 0) for a node seen for the first time and then count
        # the current instance as well; previously the first visit only
        # created the entry, so every node was undercounted by one.
        neg, pos = rec.get( k, (0, 0) )
        if label == 0:
            rec[ k ] = (neg + 1, pos)
        else:
            rec[ k ] = (neg, pos + 1)
|
|
||||||
|
|
||||||
def loadstats( fname, fpath ):
    """Collect per-column, per-node label counts from a data file and a path dump.

    The two files are read in lock step: line n of `fname` supplies the label
    (its first whitespace-separated token, parsed as int) for line n of
    `fpath`.  Each whitespace-separated column of a `fpath` line is a
    comma-separated list of node ids and is passed to recstats().

    Args:
        fname: path of the labelled data file.
        fpath: path of the dumped path file.

    Returns:
        dict mapping column index to a dict of node id -> (neg_cnt, pos_cnt).

    Raises:
        IndexError: if `fname` has fewer lines than `fpath`.
    """
    res = {}
    # context managers close both files; the label file used to be leaked
    with open( fname ) as fp, open( fpath ) as fpaths:
        for l in fpaths:
            label = int( fp.readline().split()[0] )
            # enumerate replaces the Python-2-only xrange index loop
            for i, path in enumerate( l.split() ):
                recstats( res.setdefault( i, {} ), path, label )
    return res
|
|
||||||
|
|
||||||
def mapid( idmap, fid, bid ):
    """Return a dense integer id for the pair (bid, fid), allocating one if new.

    New pairs receive the next unused id, i.e. the current size of `idmap`.

    NOTE: the callers in this script pass (booster id, node id) positionally,
    so the parameter names do not match how they are used.  The pair is only
    used as an opaque key, so the result is consistent as long as every call
    uses the same argument order.

    Args:
        idmap: dict mapping (bid, fid) tuples to assigned ids; updated in place.
        fid: second component of the key.
        bid: first component of the key.

    Returns:
        The int id assigned to the pair.
    """
    key = (bid, fid)
    if key not in idmap:
        idmap[ key ] = len( idmap )
    return idmap[ key ]
|
|
||||||
|
|
||||||
def dumpjson( fo, trees ):
    """Write `trees` to `fo` as a JSON object with one node per line.

    Args:
        fo: writable text stream.
        trees: dict with the keys 'roots', 'weights' and 'nodes'; the value
            of 'nodes' is an iterable of JSON-serialisable objects.
    """
    # the same item/key separators are used for every json.dumps call
    seps = (' , ',' : ')
    fo.write('{\n')
    fo.write(' \"roots\":' + json.dumps( trees['roots'], separators=seps ) + ',\n' )
    fo.write(' \"weights\":' + json.dumps( trees['weights'], separators=seps ) + ',\n' )
    fo.write(' \"nodes\":[\n' )
    # one serialised node per line, comma-separated, then the closing bracket
    fo.write('%s\n ]' % ',\n'.join( ' %s' % json.dumps( n, separators=seps ) for n in trees['nodes'] ) )
    fo.write('\n}\n')
|
|
||||||
|
|
||||||
# Script body: convert the text dump of a boosted-tree model into JSON.
# Reads featmap.txt, agaricus.txt.test, dump.path.txt and dump.raw.txt from
# the current directory.

# NOTE(review): `fo` is never used; the JSON is written to sys.stderr at the
# end of the script.  Confirm which stream is intended before changing it.
fo = sys.stdout
nmap = loadnmap( 'featmap.txt' )
stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )

trees = {'roots':[], 'weights':[], 'nodes':[] }
idmap = {}

# the context manager closes the dump file, which used to be leaked
with open( 'dump.raw.txt' ) as fdump:
    for l in fdump:
        if not l.strip():
            # skip blank lines instead of failing in int() below
            continue
        if l.startswith('booster['):
            # header line "booster[<id>]" starts a new tree; node 0 is its root
            bid = int( l.split('[')[1].split(']')[0] )
            trees['roots'].append( mapid(idmap,bid,0) )
            trees['weights'].append( 1.0 )
            continue

        node = {}
        # node lines start with "<node id>:"
        rid = int( l.split(':')[0] )
        node['id'] = mapid( idmap, bid, rid )
        # a node that no instance reached has no entry in the statistics;
        # report zero counts for it instead of raising KeyError
        neg_cnt, pos_cnt = stat[ bid ].get( rid, (0, 0) )
        node['neg_cnt' ] = neg_cnt
        node['pos_cnt' ] = pos_cnt

        idx = l.find('[f')
        if idx != -1:
            # split node: "[f<feature id><...] key=<child>,key=<child>"
            fid = int( l[idx+2:].split('<')[0] )
            node['label'] = nmap[ fid ]
            node['children'] = [ mapid( idmap, bid, int(it.split('=')[1]) ) for it in l.split()[1].split(',') ]
            node['edge_tags'] = ['yes','no']
        else:
            # leaf node: "<node id>:<name>=<value>"
            node['label'] = l.split(':')[1].strip()
            node['value'] = float(l.split(':')[1].split('=')[1])

        trees['nodes'].append( node )

trees['nodes'].sort( key = lambda x:x['id'] )
dumpjson( sys.stderr, trees)
|
|
||||||
@ -1,17 +1,27 @@
|
|||||||
# General Parameters
|
# General Parameters, see comment for each definition
|
||||||
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
booster_type = 0
|
booster_type = 0
|
||||||
|
# choose logistic regression loss function for binary classification
|
||||||
loss_type = 2
|
loss_type = 2
|
||||||
|
|
||||||
# Tree Booster Parameters
|
# Tree Booster Parameters
|
||||||
bst:tree_maker=2
|
# step size shrinkage
|
||||||
bst:eta=1.0
|
bst:eta = 1.0
|
||||||
bst:gamma=1.0
|
# minimum loss reduction required to make a further partition
|
||||||
bst:min_child_weight=1
|
bst:gamma = 1.0
|
||||||
bst:max_depth=3
|
# minimum sum of instance weight(hessian) needed in a child
|
||||||
|
bst:min_child_weight = 1
|
||||||
|
# maximum depth of a tree
|
||||||
|
bst:max_depth = 3
|
||||||
|
|
||||||
# Binary Classification Parameters
|
# Task Parameters
|
||||||
num_round=2
|
# the number of boosting rounds
|
||||||
save_period=0
|
num_round = 2
|
||||||
|
# 0 means do not save any model except the final round model
|
||||||
|
save_period = 0
|
||||||
|
# The path of training data
|
||||||
data = "agaricus.txt.train"
|
data = "agaricus.txt.train"
|
||||||
|
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||||
eval[test] = "agaricus.txt.test"
|
eval[test] = "agaricus.txt.test"
|
||||||
|
# The path of test data
|
||||||
test:data = "agaricus.txt.test"
|
test:data = "agaricus.txt.test"
|
||||||
@ -1,80 +0,0 @@
|
|||||||
#!/usr/bin/python
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
def loadnmap( fname ):
    """Load the feature-id to feature-name map from a text file.

    Each non-blank line must contain at least two whitespace-separated
    columns: an integer feature id followed by the feature name.  Any
    further columns are ignored.

    Args:
        fname: path of the feature map file.

    Returns:
        dict mapping int feature id to the name string.

    Raises:
        ValueError: if the first column is not an integer.
        IndexError: if a non-blank line has fewer than two columns.
    """
    nmap = {}
    # use a context manager so the file is closed even when parsing fails
    with open( fname ) as fin:
        for line in fin:
            arr = line.split()
            if not arr:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # split() already removes surrounding whitespace from each column
            nmap[ int( arr[0] ) ] = arr[1]
    return nmap
|
|
||||||
|
|
||||||
def recstats( rec, l, label ):
    """Accumulate per-node label counts for one comma-separated list of node ids.

    Args:
        rec: dict mapping int node id to a (neg_cnt, pos_cnt) tuple; updated
            in place.
        l: comma-separated string of integer node ids.
        label: instance label; 0 increments the first counter, any other
            value increments the second.

    Raises:
        ValueError: if an entry of `l` is not an integer.
    """
    for it in l.split(','):
        k = int( it )
        # Start from (0, 0) for a node seen for the first time and then count
        # the current instance as well; previously the first visit only
        # created the entry, so every node was undercounted by one.
        neg, pos = rec.get( k, (0, 0) )
        if label == 0:
            rec[ k ] = (neg + 1, pos)
        else:
            rec[ k ] = (neg, pos + 1)
|
|
||||||
|
|
||||||
def loadstats( fname, fpath ):
    """Collect per-column, per-node label counts from a data file and a path dump.

    The two files are read in lock step: line n of `fname` supplies the label
    (its first whitespace-separated token, parsed as int) for line n of
    `fpath`.  Each whitespace-separated column of a `fpath` line is a
    comma-separated list of node ids and is passed to recstats().

    Args:
        fname: path of the labelled data file.
        fpath: path of the dumped path file.

    Returns:
        dict mapping column index to a dict of node id -> (neg_cnt, pos_cnt).

    Raises:
        IndexError: if `fname` has fewer lines than `fpath`.
    """
    res = {}
    # context managers close both files; the label file used to be leaked
    with open( fname ) as fp, open( fpath ) as fpaths:
        for l in fpaths:
            label = int( fp.readline().split()[0] )
            # enumerate replaces the Python-2-only xrange index loop
            for i, path in enumerate( l.split() ):
                recstats( res.setdefault( i, {} ), path, label )
    return res
|
|
||||||
|
|
||||||
def mapid( idmap, fid, bid ):
    """Return a dense integer id for the pair (bid, fid), allocating one if new.

    New pairs receive the next unused id, i.e. the current size of `idmap`.

    NOTE: the callers in this script pass (booster id, node id) positionally,
    so the parameter names do not match how they are used.  The pair is only
    used as an opaque key, so the result is consistent as long as every call
    uses the same argument order.

    Args:
        idmap: dict mapping (bid, fid) tuples to assigned ids; updated in place.
        fid: second component of the key.
        bid: first component of the key.

    Returns:
        The int id assigned to the pair.
    """
    key = (bid, fid)
    if key not in idmap:
        idmap[ key ] = len( idmap )
    return idmap[ key ]
|
|
||||||
|
|
||||||
def dumpjson( fo, trees ):
    """Write `trees` to `fo` as a JSON object with one node per line.

    Args:
        fo: writable text stream.
        trees: dict with the keys 'roots', 'weights' and 'nodes'; the value
            of 'nodes' is an iterable of JSON-serialisable objects.
    """
    # the same item/key separators are used for every json.dumps call
    seps = (' , ',' : ')
    fo.write('{\n')
    fo.write(' \"roots\":' + json.dumps( trees['roots'], separators=seps ) + ',\n' )
    fo.write(' \"weights\":' + json.dumps( trees['weights'], separators=seps ) + ',\n' )
    fo.write(' \"nodes\":[\n' )
    # one serialised node per line, comma-separated, then the closing bracket
    fo.write('%s\n ]' % ',\n'.join( ' %s' % json.dumps( n, separators=seps ) for n in trees['nodes'] ) )
    fo.write('\n}\n')
|
|
||||||
|
|
||||||
# Script body: convert the text dump of a boosted-tree model into JSON.
# Reads featmap.txt, agaricus.txt.test, dump.path.txt and dump.raw.txt from
# the current directory.

# NOTE(review): `fo` is never used; the JSON is written to sys.stderr at the
# end of the script.  Confirm which stream is intended before changing it.
fo = sys.stdout
nmap = loadnmap( 'featmap.txt' )
stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )

trees = {'roots':[], 'weights':[], 'nodes':[] }
idmap = {}

# the context manager closes the dump file, which used to be leaked
with open( 'dump.raw.txt' ) as fdump:
    for l in fdump:
        if not l.strip():
            # skip blank lines instead of failing in int() below
            continue
        if l.startswith('booster['):
            # header line "booster[<id>]" starts a new tree; node 0 is its root
            bid = int( l.split('[')[1].split(']')[0] )
            trees['roots'].append( mapid(idmap,bid,0) )
            trees['weights'].append( 1.0 )
            continue

        node = {}
        # node lines start with "<node id>:"
        rid = int( l.split(':')[0] )
        node['id'] = mapid( idmap, bid, rid )
        # a node that no instance reached has no entry in the statistics;
        # report zero counts for it instead of raising KeyError
        neg_cnt, pos_cnt = stat[ bid ].get( rid, (0, 0) )
        node['neg_cnt' ] = neg_cnt
        node['pos_cnt' ] = pos_cnt

        idx = l.find('[f')
        if idx != -1:
            # split node: "[f<feature id><...] key=<child>,key=<child>"
            fid = int( l[idx+2:].split('<')[0] )
            node['label'] = nmap[ fid ]
            node['children'] = [ mapid( idmap, bid, int(it.split('=')[1]) ) for it in l.split()[1].split(',') ]
            node['edge_tags'] = ['yes','no']
        else:
            # leaf node: "<node id>:<name>=<value>"
            node['label'] = l.split(':')[1].strip()
            node['value'] = float(l.split(':')[1].split('=')[1])

        trees['nodes'].append( node )

trees['nodes'].sort( key = lambda x:x['id'] )
dumpjson( sys.stderr, trees)
|
|
||||||
30
demo/regression/machine.conf
Normal file
30
demo/regression/machine.conf
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# General Parameters, see comment for each definition
|
||||||
|
# choose the tree booster, 0: tree, 1: linear
|
||||||
|
booster_type = 0
|
||||||
|
# this is the only difference with classification, use 0: linear regression
|
||||||
|
# when labels are in [0,1] we can also use 1: logistic regression
|
||||||
|
loss_type = 0
|
||||||
|
|
||||||
|
# Tree Booster Parameters
|
||||||
|
# step size shrinkage
|
||||||
|
bst:eta = 1.0
|
||||||
|
# minimum loss reduction required to make a further partition
|
||||||
|
bst:gamma = 1.0
|
||||||
|
# minimum sum of instance weight(hessian) needed in a child
|
||||||
|
bst:min_child_weight = 1
|
||||||
|
# maximum depth of a tree
|
||||||
|
bst:max_depth = 3
|
||||||
|
|
||||||
|
# Task parameters
|
||||||
|
# the number of boosting rounds
|
||||||
|
num_round = 2
|
||||||
|
# 0 means do not save any model except the final round model
|
||||||
|
save_period = 0
|
||||||
|
# The path of training data
|
||||||
|
data = "machine.txt.train"
|
||||||
|
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||||
|
eval[test] = "machine.txt.test"
|
||||||
|
# The path of test data
|
||||||
|
test:data = "machine.txt.test"
|
||||||
|
|
||||||
|
|
||||||
2
demo/regression/mapfeat.py
Normal file → Executable file
2
demo/regression/mapfeat.py
Normal file → Executable file
@ -19,5 +19,3 @@ for l in open( 'machine.data' ):
|
|||||||
fo.write('\n')
|
fo.write('\n')
|
||||||
|
|
||||||
fo.close()
|
fo.close()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
0
demo/regression/mknfold.py
Normal file → Executable file
0
demo/regression/mknfold.py
Normal file → Executable file
6
demo/regression/runexp.sh
Normal file → Executable file
6
demo/regression/runexp.sh
Normal file → Executable file
@ -6,6 +6,6 @@ python mknfold.py machine.txt 1
|
|||||||
# training and output the models
|
# training and output the models
|
||||||
../../xgboost machine.conf
|
../../xgboost machine.conf
|
||||||
# output predictions of test data
|
# output predictions of test data
|
||||||
../../xgboost machine.conf task=pred model_in=0003.model
|
../../xgboost machine.conf task=pred model_in=0002.model
|
||||||
# print the boosters of 0003.model in dump.raw.txt
|
# print the boosters of 0002.model in dump.raw.txt
|
||||||
../../xgboost machine.conf task=dump model_in=0003.model name_dump=dump.raw.txt
|
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||||
Loading…
x
Reference in New Issue
Block a user