update regression

This commit is contained in:
tqchen 2014-03-26 16:25:44 -07:00
parent 27bd5496a8
commit 1440dc9c8f
8 changed files with 60 additions and 182 deletions

View File

@ -2,11 +2,12 @@ xgboost: eXtreme Gradient Boosting
======= =======
A General purpose gradient boosting (tree) library. A General purpose gradient boosting (tree) library.
Creator: Tianqi Chen Authors:
* Tianqi Chen, project creator
* Kailong Chen, contributed the regression module
Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki Tutorial and Documentation: https://github.com/tqchen/xgboost/wiki
Features Features
======= =======
* Sparse feature format: * Sparse feature format:
@ -37,4 +38,3 @@ File extension convention
* .h are interface, utils and data structures, with detailed comment; * .h are interface, utils and data structures, with detailed comment;
* .cpp are implementations that will be compiled, with less comment; * .cpp are implementations that will be compiled, with less comment;
* .hpp are implementations that will be included by .cpp, with less comment * .hpp are implementations that will be included by .cpp, with less comment

View File

@ -1,80 +0,0 @@
#!/usr/bin/python
import sys
import json
def loadnmap( fname ):
    """Load a feature map file: each line is '<feature id> <feature name>'."""
    nmap = {}
    for line in open( fname ):
        fields = line.split()
        nmap[ int(fields[0]) ] = fields[1].strip()
    return nmap
def recstats( rec, l, label ):
    """Accumulate per-node (negative, positive) sample counts.

    rec   -- dict mapping node id -> (neg_count, pos_count) tuple, updated in place
    l     -- comma-separated list of node ids visited by one instance
    label -- instance label; 0 counts as negative, anything else as positive
    """
    for it in l.split(','):
        k = int( it )
        if k not in rec:
            # initialize, then fall through so this first sample is counted too
            # (previously the first occurrence of each node was silently dropped)
            rec[ k ] = (0,0)
        if label == 0:
            rec[k] = (rec[k][0]+1,rec[k][1])
        else:
            rec[k] = (rec[k][0],rec[k][1]+1)
def loadstats( fname, fpath ):
    """Build per-booster node statistics from labels and prediction paths.

    fname -- labelled data file; the first token of each line is the label
    fpath -- path file; line i holds, for each boosting round, a
             comma-separated list of node ids visited by instance i
    Returns a dict: booster index -> {node id: (neg_count, pos_count)}.
    """
    res = {}
    # iterate both files in lockstep; 'with' guarantees the handles are closed
    # (originally neither file was ever closed)
    with open( fname ) as fp, open( fpath ) as pathfile:
        for l in pathfile:
            label = int( fp.readline().split()[0] )
            # enumerate replaces the old xrange(len(...)) index loop
            for i, field in enumerate( l.split() ):
                if i not in res:
                    res[ i ] = {}
                recstats( res[ i ], field, label )
    return res
def mapid( idmap, fid, bid ):
    """Return a stable dense id for the (bid, fid) pair, allocating a new one on first use."""
    key = (bid, fid)
    if key not in idmap:
        idmap[ key ] = len(idmap)
    return idmap[ key ]
def dumpjson( fo, trees ):
    """Write the collected tree structure to fo as a JSON object."""
    sep = (' , ', ' : ')
    node_lines = ',\n'.join( ' %s' % json.dumps( n, separators=sep ) for n in trees['nodes'] )
    fo.write('{\n')
    fo.write(' "roots":%s,\n' % json.dumps( trees['roots'], separators=sep ))
    fo.write(' "weights":%s,\n' % json.dumps( trees['weights'], separators=sep ))
    fo.write(' "nodes":[\n')
    fo.write('%s\n ]' % node_lines)
    fo.write('\n}\n')
# ---- script body: convert an xgboost text dump into a JSON tree description ----
# NOTE(review): fo is assigned but never used; dumpjson below writes to
# sys.stderr instead -- confirm which output stream is intended.
fo = sys.stdout
# feature id -> feature name
nmap = loadnmap( 'featmap.txt' )
# per-booster node statistics from test labels and prediction paths
stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )
trees = {'roots':[], 'weights':[], 'nodes':[] }
idmap = {}
for l in open( 'dump.raw.txt'):
    if l.startswith('booster['):
        # new booster section: remember its root (node 0) and a unit weight
        bid = int( l.split('[')[1].split(']')[0] )
        trees['roots'].append( mapid(idmap,bid,0) )
        trees['weights'].append( 1.0 )
        continue
    # node line: "id:[fNN<th] yes=..,no=.." for splits, "id:leaf=value" for leaves
    node = {}
    rid = int( l.split(':')[0] )
    # NOTE(review): bid is only bound after a 'booster[' line has been seen;
    # a dump not starting with one would raise NameError here.
    node['id'] = mapid( idmap, bid, rid )
    node['neg_cnt' ] = stat[ bid ][ rid ][ 0 ]
    node['pos_cnt' ] = stat[ bid ][ rid ][ 1 ]
    idx = l.find('[f')
    if idx != -1:
        # split node: label with the mapped feature name, children from yes=/no=
        fid = int( l[idx+2:len(l)].split('<')[0])
        node['label'] = nmap[ fid ]
        node['children'] = [ mapid( idmap, bid, int(it.split('=')[1]) ) for it in l.split()[1].split(',') ]
        node['edge_tags'] = ['yes','no']
    else:
        # leaf node: keep the raw text label and the numeric leaf value
        node['label'] = l.split(':')[1].strip()
        node['value'] = float(l.split(':')[1].split('=')[1])
    trees['nodes'].append( node )
# emit nodes in dense-id order so children references stay valid
trees['nodes'].sort( key = lambda x:x['id'] )
dumpjson( sys.stderr, trees)

View File

@ -1,17 +1,27 @@
# General Parameters # General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear
booster_type = 0 booster_type = 0
# choose logistic regression loss function for binary classification
loss_type = 2 loss_type = 2
# Tree Booster Parameters # Tree Booster Parameters
bst:tree_maker=2 # step size shrinkage
bst:eta=1.0 bst:eta = 1.0
bst:gamma=1.0 # minimum loss reduction required to make a further partition
bst:min_child_weight=1 bst:gamma = 1.0
bst:max_depth=3 # minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
# Binary Classification Parameters # Task Parameters
num_round=2 # the number of round to do boosting
save_period=0 num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "agaricus.txt.train" data = "agaricus.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "agaricus.txt.test" eval[test] = "agaricus.txt.test"
test:data = "agaricus.txt.test" # The path of test data
test:data = "agaricus.txt.test"

View File

@ -1,80 +0,0 @@
#!/usr/bin/python
import sys
import json
def loadnmap( fname ):
    """Load a feature map file: each line is '<feature id> <feature name>'."""
    nmap = {}
    for line in open( fname ):
        fields = line.split()
        nmap[ int(fields[0]) ] = fields[1].strip()
    return nmap
def recstats( rec, l, label ):
    """Accumulate per-node (negative, positive) sample counts.

    rec   -- dict mapping node id -> (neg_count, pos_count) tuple, updated in place
    l     -- comma-separated list of node ids visited by one instance
    label -- instance label; 0 counts as negative, anything else as positive
    """
    for it in l.split(','):
        k = int( it )
        if k not in rec:
            # initialize, then fall through so this first sample is counted too
            # (previously the first occurrence of each node was silently dropped)
            rec[ k ] = (0,0)
        if label == 0:
            rec[k] = (rec[k][0]+1,rec[k][1])
        else:
            rec[k] = (rec[k][0],rec[k][1]+1)
def loadstats( fname, fpath ):
    """Build per-booster node statistics from labels and prediction paths.

    fname -- labelled data file; the first token of each line is the label
    fpath -- path file; line i holds, for each boosting round, a
             comma-separated list of node ids visited by instance i
    Returns a dict: booster index -> {node id: (neg_count, pos_count)}.
    """
    res = {}
    # iterate both files in lockstep; 'with' guarantees the handles are closed
    # (originally neither file was ever closed)
    with open( fname ) as fp, open( fpath ) as pathfile:
        for l in pathfile:
            label = int( fp.readline().split()[0] )
            # enumerate replaces the old xrange(len(...)) index loop
            for i, field in enumerate( l.split() ):
                if i not in res:
                    res[ i ] = {}
                recstats( res[ i ], field, label )
    return res
def mapid( idmap, fid, bid ):
    """Return a stable dense id for the (bid, fid) pair, allocating a new one on first use."""
    key = (bid, fid)
    if key not in idmap:
        idmap[ key ] = len(idmap)
    return idmap[ key ]
def dumpjson( fo, trees ):
    """Write the collected tree structure to fo as a JSON object."""
    sep = (' , ', ' : ')
    node_lines = ',\n'.join( ' %s' % json.dumps( n, separators=sep ) for n in trees['nodes'] )
    fo.write('{\n')
    fo.write(' "roots":%s,\n' % json.dumps( trees['roots'], separators=sep ))
    fo.write(' "weights":%s,\n' % json.dumps( trees['weights'], separators=sep ))
    fo.write(' "nodes":[\n')
    fo.write('%s\n ]' % node_lines)
    fo.write('\n}\n')
# ---- script body: convert an xgboost text dump into a JSON tree description ----
# NOTE(review): fo is assigned but never used; dumpjson below writes to
# sys.stderr instead -- confirm which output stream is intended.
fo = sys.stdout
# feature id -> feature name
nmap = loadnmap( 'featmap.txt' )
# per-booster node statistics from test labels and prediction paths
stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )
trees = {'roots':[], 'weights':[], 'nodes':[] }
idmap = {}
for l in open( 'dump.raw.txt'):
    if l.startswith('booster['):
        # new booster section: remember its root (node 0) and a unit weight
        bid = int( l.split('[')[1].split(']')[0] )
        trees['roots'].append( mapid(idmap,bid,0) )
        trees['weights'].append( 1.0 )
        continue
    # node line: "id:[fNN<th] yes=..,no=.." for splits, "id:leaf=value" for leaves
    node = {}
    rid = int( l.split(':')[0] )
    # NOTE(review): bid is only bound after a 'booster[' line has been seen;
    # a dump not starting with one would raise NameError here.
    node['id'] = mapid( idmap, bid, rid )
    node['neg_cnt' ] = stat[ bid ][ rid ][ 0 ]
    node['pos_cnt' ] = stat[ bid ][ rid ][ 1 ]
    idx = l.find('[f')
    if idx != -1:
        # split node: label with the mapped feature name, children from yes=/no=
        fid = int( l[idx+2:len(l)].split('<')[0])
        node['label'] = nmap[ fid ]
        node['children'] = [ mapid( idmap, bid, int(it.split('=')[1]) ) for it in l.split()[1].split(',') ]
        node['edge_tags'] = ['yes','no']
    else:
        # leaf node: keep the raw text label and the numeric leaf value
        node['label'] = l.split(':')[1].strip()
        node['value'] = float(l.split(':')[1].split('=')[1])
    trees['nodes'].append( node )
# emit nodes in dense-id order so children references stay valid
trees['nodes'].sort( key = lambda x:x['id'] )
dumpjson( sys.stderr, trees)

View File

@ -0,0 +1,30 @@
# General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear
booster_type = 0
# this is the only difference with classification, use 0: linear regression
# when labels are in [0,1] we can also use 1: logistic regression
loss_type = 0
# Tree Booster Parameters
# step size shrinkage
bst:eta = 1.0
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
# Task parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "machine.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "machine.txt.test"
# The path of test data
test:data = "machine.txt.test"

2
demo/regression/mapfeat.py Normal file → Executable file
View File

@ -19,5 +19,3 @@ for l in open( 'machine.data' ):
fo.write('\n') fo.write('\n')
fo.close() fo.close()

0
demo/regression/mknfold.py Normal file → Executable file
View File

6
demo/regression/runexp.sh Normal file → Executable file
View File

@ -6,6 +6,6 @@ python mknfold.py machine.txt 1
# training and output the models # training and output the models
../../xgboost machine.conf ../../xgboost machine.conf
# output predictions of test data # output predictions of test data
../../xgboost machine.conf task=pred model_in=0003.model ../../xgboost machine.conf task=pred model_in=0002.model
# print the boosters of 0003.model in dump.raw.txt # print the boosters of 0002.model in dump.raw.txt
../../xgboost machine.conf task=dump model_in=0003.model name_dump=dump.raw.txt ../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt