This commit is contained in:
tqchen 2014-03-03 12:26:40 -08:00
parent 2adf905dcf
commit e3b7abfb47
3 changed files with 17 additions and 5 deletions

View File

@ -1,2 +1,12 @@
example of UCI dataset
Example of training a binary classifier on a UCI dataset
http://archive.ics.uci.edu/ml/datasets/Mushroom
Run: ./runexp.sh
Format of input: LIBSVM format
Format of featmap.txt:
<featureid> <featurename> <q or i>\n
q means continuous quantities, i means indicator features.
Feature ids must range from 0 to num_features and must be listed in sorted order.

View File

@ -46,13 +46,13 @@ def dumpjson( fo, trees ):
fo.write('\n}\n')
fo = sys.stdout
nmap = loadnmap( 'featname.txt' )
nmap = loadnmap( 'featmap.txt' )
stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )
trees = {'roots':[], 'weights':[], 'nodes':[] }
idmap = {}
for l in open( 'dump.txt'):
for l in open( 'dump.raw.txt'):
if l.startswith('booster['):
bid = int( l.split('[')[1].split(']')[0] )
trees['roots'].append( mapid(idmap,bid,0) )

View File

@ -1,10 +1,12 @@
#!/bin/bash
# Runs the example: feature mapping, train/test split, training,
# and two model dumps (without and with a feature map).

# Map features using indicator encoding; this also produces featmap.txt.
python mapfeat.py
# Split the data into train and test sets.
python mknfold.py agaricus.txt 1
# Train using the settings in mushroom.conf.
../../xgboost mushroom.conf
# This is what the dump looks like without a feature map.
../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt
# This is what the dump looks like with a feature map.
../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
# Print the dump that was written with the feature map applied.
cat dump.nice.txt