remove test directory

2014-03-23 00:05:46 +08:00 · 2014-03-23 00:05:46 +08:00 · 57713be940
commit 57713be940
parent 77901f2428
14 changed files with 21 additions and 8526 deletions
--- a/demo/binary_classification/README
+++ b/demo/binary_classification/README
@ -1,5 +1,4 @@
-example of training a binary classifier on UCI dataset
+Demonstrating how to use XGBoost to accomplish binary classification tasks on the UCI mushroom dataset: http://archive.ics.uci.edu/ml/datasets/Mushroom
 http://archive.ics.uci.edu/ml/datasets/Mushroom
 Run: ./runexp.sh
--- a/demo/binary_classification/mushroom.conf
+++ b/demo/binary_classification/mushroom.conf
@ -1,18 +1,17 @@
-num_round=2
+# General Parameters
 save_period=0
 data = "agaricus.txt.train"
 eval[test] = "agaricus.txt.test"
 test:data =  "agaricus.txt.test"
 booster_type = 0
 loss_type = 2
 # Tree Booster Parameters
 bst:tree_maker=2
 bst:eta=1.0
 bst:gamma=1.0
 bst:min_child_weight=1   
 bst:max_depth=3           
 # Binary Classification Parameters
 num_round=2
 save_period=0
 data = "agaricus.txt.train"
 eval[test] = "agaricus.txt.test"
 test:data =  "agaricus.txt.test"
--- a/demo/binary_classification/runexp.sh
+++ b/demo/binary_classification/runexp.sh
@ -3,10 +3,12 @@
 python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
-# training
+# training and output the models
 ../../xgboost mushroom.conf
-# this is what dump will looklike without feature map
+# output prediction task=pred 
 ../../xgboost mushroom.conf task=pred model_in=0003.model
 # print the boosters of 0003.model in dump.raw.txt
 ../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt 
-# this is what dump will looklike with feature map
+# use the feature map in printing for better visualization
 ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
--- a/demo/regression/README
+++ b/demo/regression/README
@ -1,5 +1,4 @@
-example of training a binary classifier on UCI dataset
+Demonstrating how to use XGBoost to accomplish regression tasks on the UCI mushroom dataset: http://archive.ics.uci.edu/ml/datasets/Mushroom
 http://archive.ics.uci.edu/ml/datasets/Mushroom
 Run: ./runexp.sh
--- a/demo/regression/runexp.sh
+++ b/demo/regression/runexp.sh
@ -3,10 +3,12 @@
 python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
-# training
+# training and output the models
 ../../xgboost mushroom.conf
-# this is what dump will looklike without feature map
+# output predictions of test data
 ../../xgboost mushroom.conf task=pred model_in=0003.model
 # print the boosters of 0003.model in dump.raw.txt
 ../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt 
-# this is what dump will looklike with feature map
+# use the feature map in printing for better visualization
 ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
 cat dump.nice.txt
--- a/demo/test/README
+++ b/demo/test/README
@ -1 +0,0 @@
 test folder to test new functions
--- a/demo/test/agaricus-lepiota.data
+++ b/demo/test/agaricus-lepiota.data
--- a/demo/test/agaricus-lepiota.fmap
+++ b/demo/test/agaricus-lepiota.fmap
@ -1,32 +0,0 @@
     1. cap-shape:                bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
     2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
     3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
     4. bruises?:                 bruises=t,no=f
     5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f,
                                  musty=m,none=n,pungent=p,spicy=s
     6. gill-attachment:          attached=a,descending=d,free=f,notched=n
     7. gill-spacing:             close=c,crowded=w,distant=d
     8. gill-size:                broad=b,narrow=n
     9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g,
                                  green=r,orange=o,pink=p,purple=u,red=e,
                                  white=w,yellow=y
    10. stalk-shape:              enlarging=e,tapering=t
    11. stalk-root:               bulbous=b,club=c,cup=u,equal=e,
                                  rhizomorphs=z,rooted=r,missing=?
    12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
    13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
    14. stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    15. stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    16. veil-type:                partial=p,universal=u
    17. veil-color:               brown=n,orange=o,white=w,yellow=y
    18. ring-number:              none=n,one=o,two=t
    19. ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l,
                                  none=n,pendant=p,sheathing=s,zone=z
    20. spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r,
                                  orange=o,purple=u,white=w,yellow=y
    21. population:               abundant=a,clustered=c,numerous=n,
                                  scattered=s,several=v,solitary=y
    22. habitat:                  grasses=g,leaves=l,meadows=m,paths=p,
                                  urban=u,waste=w,woods=d
--- a/demo/test/agaricus-lepiota.names
+++ b/demo/test/agaricus-lepiota.names
@ -1,148 +0,0 @@
 1. Title: Mushroom Database
 2. Sources: 
    (a) Mushroom records drawn from The Audubon Society Field Guide to North
        American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
        A. Knopf
    (b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
    (c) Date: 27 April 1987
 3. Past Usage:
    1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
       Adjustment (Technical Report 87-19).  Doctoral dissertation, Department
       of Information and Computer Science, University of California, Irvine.
       --- STAGGER: asymptoted to 95% classification accuracy after reviewing
           1000 instances.
    2. Iba,W., Wogulis,J., & Langley,P. (1988).  Trading off Simplicity
       and Coverage in Incremental Concept Learning. In Proceedings of 
       the 5th International Conference on Machine Learning, 73-79.
       Ann Arbor, Michigan: Morgan Kaufmann.  
       -- approximately the same results with their HILLARY algorithm    
    3. In the following references a set of rules (given below) were
 	learned for this data set which may serve as a point of
 	comparison for other researchers.
 	Duch W, Adamczak R, Grabczewski K (1996) Extraction of logical rules
 	from training data using backpropagation networks, in: Proc. of the
 	The 1st Online Workshop on Soft Computing, 19-30.Aug.1996, pp. 25-30,
 	available on-line at: http://www.bioele.nuee.nagoya-u.ac.jp/wsc1/
 	Duch W, Adamczak R, Grabczewski K, Ishikawa M, Ueda H, Extraction of
 	crisp logical rules using constrained backpropagation networks -
 	comparison of two new approaches, in: Proc. of the European Symposium
 	on Artificial Neural Networks (ESANN'97), Bruge, Belgium 16-18.4.1997,
 	pp. xx-xx
 	Wlodzislaw Duch, Department of Computer Methods, Nicholas Copernicus
 	University, 87-100 Torun, Grudziadzka 5, Poland
 	e-mail: duch@phys.uni.torun.pl
 	WWW     http://www.phys.uni.torun.pl/kmk/
 	Date: Mon, 17 Feb 1997 13:47:40 +0100
 	From: Wlodzislaw Duch <duch@phys.uni.torun.pl>
 	Organization: Dept. of Computer Methods, UMK
 	I have attached a file containing logical rules for mushrooms.
 	It should be helpful for other people since only in the last year I
 	have seen about 10 papers analyzing this dataset and obtaining quite
 	complex rules. We will try to contribute other results later.
 	With best regards, Wlodek Duch
 	________________________________________________________________
 	Logical rules for the mushroom data sets.
 	Logical rules given below seem to be the simplest possible for the
 	mushroom dataset and therefore should be treated as benchmark results.
 	Disjunctive rules for poisonous mushrooms, from most general
 	to most specific:
 	P_1) odor=NOT(almond.OR.anise.OR.none)
 	     120 poisonous cases missed, 98.52% accuracy
 	P_2) spore-print-color=green
 	     48 cases missed, 99.41% accuracy
 	P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.
 	          (stalk-color-above-ring=NOT.brown) 
 	     8 cases missed, 99.90% accuracy
 	P_4) habitat=leaves.AND.cap-color=white
 	         100% accuracy     
 	Rule P_4) may also be
 	P_4') population=clustered.AND.cap_color=white
 	These rules involve 6 attributes (out of 22). Rules for edible
 	mushrooms are obtained as negation of the rules given above, for
 	example the rule:
 	odor=(almond.OR.anise.OR.none).AND.spore-print-color=NOT.green
 	gives 48 errors, or 99.41% accuracy on the whole dataset.
 	Several slightly more complex variations on these rules exist,
 	involving other attributes, such as gill_size, gill_spacing,
 	stalk_surface_above_ring, but the rules given above are the simplest
 	we have found.
 4. Relevant Information:
    This data set includes descriptions of hypothetical samples
    corresponding to 23 species of gilled mushrooms in the Agaricus and
    Lepiota Family (pp. 500-525).  Each species is identified as
    definitely edible, definitely poisonous, or of unknown edibility and
    not recommended.  This latter class was combined with the poisonous
    one.  The Guide clearly states that there is no simple rule for
    determining the edibility of a mushroom; no rule like ``leaflets
    three, let it be'' for Poisonous Oak and Ivy.
 5. Number of Instances: 8124
 6. Number of Attributes: 22 (all nominally valued)
 7. Attribute Information: (classes: edible=e, poisonous=p)
     1. cap-shape:                bell=b,conical=c,convex=x,flat=f,
                                  knobbed=k,sunken=s
     2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
     3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r,
                                  pink=p,purple=u,red=e,white=w,yellow=y
     4. bruises?:                 bruises=t,no=f
     5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f,
                                  musty=m,none=n,pungent=p,spicy=s
     6. gill-attachment:          attached=a,descending=d,free=f,notched=n
     7. gill-spacing:             close=c,crowded=w,distant=d
     8. gill-size:                broad=b,narrow=n
     9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g,
                                  green=r,orange=o,pink=p,purple=u,red=e,
                                  white=w,yellow=y
    10. stalk-shape:              enlarging=e,tapering=t
    11. stalk-root:               bulbous=b,club=c,cup=u,equal=e,
                                  rhizomorphs=z,rooted=r,missing=?
    12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
    13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
    14. stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    15. stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    16. veil-type:                partial=p,universal=u
    17. veil-color:               brown=n,orange=o,white=w,yellow=y
    18. ring-number:              none=n,one=o,two=t
    19. ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l,
                                  none=n,pendant=p,sheathing=s,zone=z
    20. spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r,
                                  orange=o,purple=u,white=w,yellow=y
    21. population:               abundant=a,clustered=c,numerous=n,
                                  scattered=s,several=v,solitary=y
    22. habitat:                  grasses=g,leaves=l,meadows=m,paths=p,
                                  urban=u,waste=w,woods=d
 8. Missing Attribute Values: 2480 of them (denoted by "?"), all for
   attribute #11.
 9. Class Distribution: 
    --    edible: 4208 (51.8%)
    -- poisonous: 3916 (48.2%)
    --     total: 8124 instances
--- a/demo/test/dump2json.py
+++ b/demo/test/dump2json.py
@ -1,80 +0,0 @@
 #!/usr/bin/python
 import sys
 import json
 def loadnmap( fname ):
    """Load a feature-id -> feature-name map from a text file.

    Each non-blank line is split on whitespace: the first field is parsed
    as the integer feature id and the second field is stored as its name.
    Any further fields on the line are ignored.

    Returns a dict mapping int id to name string.
    """
    nmap = {}
    # 'with' closes the handle even when a line fails to parse
    with open( fname ) as fi:
        for l in fi:
            arr = l.split()
            if not arr:
                # tolerate blank lines instead of failing with IndexError
                continue
            nmap[int(arr[0])] = arr[1].strip()
    return nmap
 def recstats( rec, l, label ):
    """Add one labelled observation to the counters of every id listed in l.

    rec   -- dict mapping int id -> (count with label == 0, count otherwise);
             updated in place.
    l     -- comma-separated string of integer ids.
    label -- label of the observation; 0 bumps the first counter, anything
             else bumps the second.
    """
    for it in l.split(','):
        k = int( it )
        # start from (0,0) for an unseen id, then count this observation too;
        # previously the first observation of each id was silently dropped
        neg, pos = rec.get( k, (0, 0) )
        if label == 0:
            rec[ k ] = (neg + 1, pos)
        else:
            rec[ k ] = (neg, pos + 1)
 def loadstats( fname, fpath ):
    """Collect per-column label statistics by reading two files in lockstep.

    fname -- file whose first whitespace-separated token on each line is an
             integer label.
    fpath -- file read line by line alongside fname; each line is split on
             whitespace and column i (a comma-separated id list) is passed
             to recstats together with the label from the matching line.

    Returns a dict mapping column index -> the dict filled in by recstats.
    """
    res = {}
    # both handles are closed on exit, including on a parse error
    with open( fname ) as fp, open( fpath ) as fi:
        for l in fi:
            label = int( fp.readline().split()[0] )
            # enumerate replaces the Python-2-only xrange index loop
            for i, ids in enumerate( l.split() ):
                recstats( res.setdefault( i, {} ), ids, label )
    return res
 def mapid( idmap, fid, bid ):
    """Return the dense integer id registered for the pair (bid, fid).

    An unseen pair is registered in idmap with the next free id, which is
    the number of pairs already stored.
    """
    key = (bid, fid)
    return idmap.setdefault( key, len(idmap) )
 def dumpjson( fo, trees ):
    """Write trees to fo as a hand-laid-out JSON object.

    trees must provide the keys 'roots', 'weights' and 'nodes'. The first
    two are emitted on one line each; every element of 'nodes' is emitted
    on its own indented line inside the "nodes" array.
    """
    def encode( obj ):
        # same spaced separators for every serialised value
        return json.dumps( obj, separators=(' , ',' : ') )

    fo.write('{\n')
    for prefix, key in ( ('  \"roots\":', 'roots'), ('  \"weights\":', 'weights') ):
        fo.write( prefix + encode( trees[key] ) + ',\n' )
    fo.write('  \"nodes\":[\n' )
    node_lines = [ '    %s' % encode( node ) for node in trees['nodes'] ]
    fo.write('%s\n   ]' % ',\n'.join( node_lines ) )
    fo.write('\n}\n')
 # Driver: convert the text dump in dump.raw.txt into the JSON layout written
 # by dumpjson, attaching per-node label counts taken from loadstats.
 # NOTE(review): fo is assigned here but never used below; the final
 # dumpjson call writes to sys.stderr instead -- confirm which is intended.
 fo = sys.stdout
 nmap = loadnmap( 'featmap.txt' )
 stat = loadstats( 'agaricus.txt.test', 'dump.path.txt' )
 trees = {'roots':[], 'weights':[], 'nodes':[] }
 idmap = {}
 for l in open( 'dump.raw.txt'):
    # a 'booster[<n>]' header line starts a new tree: register its root
    # (node 0) and give the tree a weight of 1.0
    if l.startswith('booster['):
        bid = int( l.split('[')[1].split(']')[0] )
        # mapid is declared as (idmap, fid, bid); every call in this script
        # passes the booster id in the fid slot and the node id in the bid slot
        trees['roots'].append( mapid(idmap,bid,0) )
        trees['weights'].append( 1.0 )
        continue
    # any other line is treated as a node line of the form '<node id>:...'
    # NOTE(review): bid is unset if a node line precedes the first booster header
    node = {}
    rid = int( l.split(':')[0] )
    node['id'] = mapid( idmap, bid, rid )
    # stat[bid][rid] is the count pair built by recstats: index 0 for
    # label == 0, index 1 for every other label
    node['neg_cnt' ] = stat[ bid ][ rid ][ 0 ]
    node['pos_cnt' ] = stat[ bid ][ rid ][ 1 ] 
    idx = l.find('[f')
    if idx != -1:
        # line contains '[f<fid><...': the feature id is the integer between
        # '[f' and the first '<'
        fid = int( l[idx+2:len(l)].split('<')[0])
        node['label'] = nmap[ fid ]
        # the second whitespace-separated field is a comma list of
        # '<name>=<node id>' items; the node ids become the children
        node['children'] = [ mapid( idmap, bid, int(it.split('=')[1]) ) for it in l.split()[1].split(',') ]
        node['edge_tags'] = ['yes','no']
    else:
        # no '[f' marker: the text after the first ':' is used as the label
        # and the number after its '=' as the node value
        node['label'] = l.split(':')[1].strip()
        node['value'] = float(l.split(':')[1].split('=')[1])
    trees['nodes'].append( node )
 # emit nodes ordered by their dense id
 trees['nodes'].sort( key = lambda x:x['id'] )
 dumpjson( sys.stderr, trees)
--- a/demo/test/mapfeat.py
+++ b/demo/test/mapfeat.py
@ -1,50 +0,0 @@
 #!/usr/bin/python
 import sys
 def loadfmap( fname ):
    """Parse an attribute description file into indicator-feature maps.

    A line whose first token contains '.' starts a new attribute: the token
    (dots stripped) is the attribute index, the second token (colons
    stripped) is the attribute name and the third token holds a comma list
    of 'valuename=code' items. Any other non-blank line is a continuation
    whose first token holds more items for the current attribute.

    Returns (fmap, nmap):
      fmap[attribute index][code] -> feature id, assigned in order of appearance
      nmap[feature id]            -> 'attributename=valuename'
    """
    fmap = {}
    nmap = {}
    # 'with' closes the handle even when a line fails to parse
    with open( fname ) as fi:
        for l in fi:
            arr = l.split()
            if not arr:
                # tolerate blank lines instead of failing with IndexError
                continue
            if arr[0].find('.') != -1:
                idx = int( arr[0].strip('.') )
                assert idx not in fmap
                fmap[ idx ] = {}
                ftype = arr[1].strip(':')
                content = arr[2]
            else:
                # continuation line: idx and ftype carry over from the
                # attribute header seen last
                content = arr[0]
            for it in content.split(','):
                if it.strip() == '':
                    # a trailing comma leaves an empty item
                    continue
                k , v = it.split('=')
                fmap[ idx ][ v ] = len(nmap)
                nmap[ len(nmap) ] = ftype+'='+k
    return fmap, nmap
 def write_nmap( fo, nmap ):
    """Write one 'id<TAB>name<TAB>i' line to fo for each feature id.

    nmap is indexed with the integers 0 .. len(nmap)-1, in that order.
    """
    # range instead of the Python-2-only xrange so this runs on Python 2 and 3
    for i in range( len(nmap) ):
        fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
 # start here
 # Build the feature maps, write featmap.txt, then re-encode the raw
 # comma-separated records as '<label> <feature id>:1 ...' lines in agaricus.txt.
 fmap, nmap = loadfmap( 'agaricus-lepiota.fmap' )
 with open( 'featmap.txt', 'w' ) as fo:
    write_nmap( fo, nmap )
 # the output is opened first, as before; both handles are closed on exit,
 # including when a record fails the assertion or a lookup below
 with open( 'agaricus.txt', 'w' ) as fo, open( 'agaricus-lepiota.data' ) as fi:
    for l in fi:
        arr = l.split(',')
        # first field is the class: 'p' is written as label 1, 'e' as label 0
        if arr[0] == 'p':
            fo.write('1')
        else:
            assert arr[0] == 'e'
            fo.write('0')
        # field i holds the code of attribute i; emit its indicator feature
        for i in range( 1, len(arr) ):
            fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
        fo.write('\n')
--- a/demo/test/mknfold.py
+++ b/demo/test/mknfold.py
@ -1,29 +0,0 @@
 #!/usr/bin/python
 import sys
 import random
 # Split <filename> into <filename>.train and <filename>.test: each line is
 # assigned a random fold in 1..nfold and lines that land in fold k go to the
 # test file, all others to the train file.
 # Both <filename> and <k> are required, so fewer than 3 argv entries means
 # usage; the old '< 2' check let a missing <k> crash with IndexError.
 if len(sys.argv) < 3:
    print('Usage:<filename> <k> [nfold = 5]')
    sys.exit(0)
 # fixed seed keeps the split reproducible between runs
 random.seed( 10 )
 k = int( sys.argv[2] )
 if len(sys.argv) > 3:
    nfold = int( sys.argv[3] )
 else:
    nfold = 5
 # all three handles are closed on exit, including on error
 with open( sys.argv[1], 'r' ) as fi, \
      open( sys.argv[1]+'.train', 'w' ) as ftr, \
      open( sys.argv[1]+'.test', 'w' ) as fte:
    for l in fi:
        if random.randint( 1 , nfold ) == k:
            fte.write( l )
        else:
            ftr.write( l )
--- a/demo/test/mushroom.conf
+++ b/demo/test/mushroom.conf
@ -1,19 +0,0 @@
 num_round=2
 save_period=0
 data = "agaricus.txt.train"
 test:data =  "agaricus.txt.test"
 eval[test] = "agaricus.txt.test"
 eval[train] = "agaricus.txt.train"
 booster_type = 0
 loss_type = 2
 bst:tree_maker=2
 bst:eta=1.0
 bst:gamma=1.0
 bst:min_child_weight=1   
 bst:max_depth=3           
--- a/demo/test/runexp.sh
+++ b/demo/test/runexp.sh
@ -1,23 +0,0 @@
 #!/bin/bash
 # map feature using indicator encoding, also produce featmap.txt
 python mapfeat.py
 # split train and test
 python mknfold.py agaricus.txt 1
 # training
 ../../xgboost mushroom.conf num_round=2 model_out=full.model bst:max_depth=3
 ../../xgboost mushroom.conf task=dump model_in=full.model fmap=featmap.txt name_dump=dump.full.txt
 # major element of batch running: add batch prefix to each setting, batch:run=1 will run that action
 # each continued line ends in ' \' so arguments stay separated when the lines
 # are joined; the last argument line has no backslash so the dump command
 # below runs as its own command
 ../../xgboost mushroom.conf model_in=full.model model_out=m1.model task=interact \
 batch:interact:booster_index=0 batch:bst:interact:remove=1 batch:run=1 \
 batch:interact:booster_index=1 batch:bst:interact:remove=1 batch:run=1 \
 batch:interact:booster_index=1 batch:bst:interact:expand=9 batch:run=1
 ../../xgboost mushroom.conf task=dump model_in=m1.model fmap=featmap.txt name_dump=dump.m1.txt
 echo "========full======="
 cat dump.full.txt
 echo "========m1======="
 cat dump.m1.txt