ok

2014-03-03 12:26:40 -08:00 · 2014-03-03 12:26:40 -08:00 · e3b7abfb47
commit e3b7abfb47
parent 2adf905dcf
3 changed files with 17 additions and 5 deletions
--- a/demo/mushroom/README
+++ b/demo/mushroom/README
@@ -1,2 +1,12 @@
-example of UCI dataset
+Example of training a binary classifier on the UCI Mushroom dataset:
 http://archive.ics.uci.edu/ml/datasets/Mushroom
+
+Run: ./runexp.sh
+
+Format of input: LIBSVM format
+
+Format of featmap.txt:
+<featureid> <featurename> <q or i>\n
+
+q marks a continuous quantity; i marks an indicator feature.
+Feature ids must range from 0 to num_features and appear in sorted order.
--- a/demo/mushroom/dump2json.py
+++ b/demo/mushroom/dump2json.py
@@ -46,13 +46,13 @@ def dumpjson( fo, trees ):
    fo.write('\n}\n')
        
 fo = sys.stdout
-nmap = loadnmap( 'featname.txt' )
+nmap = loadnmap( 'featmap.txt' )
 stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )

 trees = {'roots':[], 'weights':[], 'nodes':[] }
 idmap = {}

-for l in open( 'dump.txt'):
+for l in open( 'dump.raw.txt'):
    if l.startswith('booster['):
        bid = int( l.split('[')[1].split(']')[0] )
        trees['roots'].append( mapid(idmap,bid,0) )
--- a/demo/mushroom/runexp.sh
+++ b/demo/mushroom/runexp.sh
@@ -1,10 +1,12 @@
 #!/bin/bash
+# map feature using indicator encoding, also produce featmap.txt
 python mapfeat.py
+# split train and test
 python mknfold.py agaricus.txt 1
+# training
 ../../xgboost mushroom.conf
 # this is what the dump will look like without a feature map
 ../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt 
 # this is what the dump will look like with a feature map
 ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
-