From e3b7abfb47986db40c6e974032c17501b7147058 Mon Sep 17 00:00:00 2001 From: tqchen Date: Mon, 3 Mar 2014 12:26:40 -0800 Subject: [PATCH] ok --- demo/mushroom/README | 12 +++++++++++- demo/mushroom/dump2json.py | 6 +++--- demo/mushroom/runexp.sh | 4 +++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/demo/mushroom/README b/demo/mushroom/README index 4707a30c3..8067f5612 100644 --- a/demo/mushroom/README +++ b/demo/mushroom/README @@ -1,2 +1,12 @@ -example of UCI dataset +example of training a binary classifier on UCI dataset http://archive.ics.uci.edu/ml/datasets/Mushroom + +Run: ./runexp.sh + +Format of input: LIBSVM format + +Format of featmap.txt: +<featureid> <featurename> <q or i>\n + +q means continuous quantities, i means indicator features. +Feature id must be from 0 to num_features, in sorted order. diff --git a/demo/mushroom/dump2json.py b/demo/mushroom/dump2json.py index dae506457..de1a32cca 100755 --- a/demo/mushroom/dump2json.py +++ b/demo/mushroom/dump2json.py @@ -46,13 +46,13 @@ def dumpjson( fo, trees ): fo.write('\n}\n') fo = sys.stdout -nmap = loadnmap( 'featname.txt' ) +nmap = loadnmap( 'featmap.txt' ) stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' ) trees = {'roots':[], 'weights':[], 'nodes':[] } idmap = {} -for l in open( 'dump.txt'): +for l in open( 'dump.raw.txt'): if l.startswith('booster['): bid = int( l.split('[')[1].split(']')[0] ) trees['roots'].append( mapid(idmap,bid,0) ) @@ -63,7 +63,7 @@ for l in open( 'dump.txt'): rid = int( l.split(':')[0] ) node['id'] = mapid( idmap, bid, rid ) node['neg_cnt' ] = stat[ bid ][ rid ][ 0 ] - node['pos_cnt' ] = stat[ bid ][ rid ][ 1 ] + node['pos_cnt' ] = stat[ bid ][ rid ][ 1 ] idx = l.find('[f') if idx != -1: diff --git a/demo/mushroom/runexp.sh b/demo/mushroom/runexp.sh index 50d60ca9f..e855085ac 100755 --- a/demo/mushroom/runexp.sh +++ b/demo/mushroom/runexp.sh @@ -1,10 +1,12 @@ #!/bin/bash +# map feature using indicator encoding, also produce featmap.txt python mapfeat.py +# split train and test python 
mknfold.py agaricus.txt 1 +# training ../../xgboost mushroom.conf # this is what dump will looklike without feature map ../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt # this is what dump will looklike with feature map ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt cat dump.nice.txt -