From 623e003923889f68dc869a0ed2c24c2375639934 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Mon, 3 Mar 2014 11:05:10 -0800
Subject: [PATCH] fix fmap

---
 booster/tree/xgboost_col_treemaker.hpp |  1 +
 booster/tree/xgboost_svdf_tree.hpp     |  8 +-
 booster/tree/xgboost_tree.hpp          |  9 ++--
 booster/tree/xgboost_tree_model.h      | 70 +++++++++++++++++++++-----
 booster/xgboost.h                      |  7 ++-
 booster/xgboost_gbmbase.h              |  6 ++-
 demo/mushroom/mapfeat.py               |  4 +-
 demo/mushroom/runexp.sh                |  8 ++-
 regression/xgboost_reg.h               |  6 ++-
 regression/xgboost_reg_main.cpp        | 19 +++++--
 utils/xgboost_fmap.h                   | 68 +++++++++++++++++++++++++
 11 files changed, 172 insertions(+), 34 deletions(-)
 create mode 100644 utils/xgboost_fmap.h

diff --git a/booster/tree/xgboost_col_treemaker.hpp b/booster/tree/xgboost_col_treemaker.hpp
index 1205f4cca..22a03c432 100644
--- a/booster/tree/xgboost_col_treemaker.hpp
+++ b/booster/tree/xgboost_col_treemaker.hpp
@@ -155,6 +155,7 @@ namespace xgboost{
             for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
                 tree.stat( nid ).leaf_child_cnt = 0;
                 tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
+                tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
             }
             for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
                 if( tree[ nid ].is_leaf() ) this->TryPruneLeaf( nid, tree.GetDepth(nid) );
diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp
index 715a920b9..1ff9ea1c8 100644
--- a/booster/tree/xgboost_svdf_tree.hpp
+++ b/booster/tree/xgboost_svdf_tree.hpp
@@ -154,18 +154,20 @@ namespace xgboost{
                     if( compute ){
                         sum_grad += grad[ ridx ];
                         sum_hess += hess[ ridx ];
+                    }
                 }
-            }
+                tree.stat( tsk.nid ).sum_hess = static_cast<float>( sum_hess );
                 tree[ tsk.nid ].set_leaf( param.learning_rate * param.CalcWeight( sum_grad, sum_hess, tsk.parent_base_weight ) );
                 this->try_prune_leaf( tsk.nid, tree.GetDepth( tsk.nid ) );
             }
         private:
             // make split for current task, re-arrange positions in idset
-            inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double base_weight ){
+            inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double sum_hess, double base_weight ){
                 // before split, first prepare statistics
                 RegTree::NodeStat &s = tree.stat( tsk.nid );
                 s.loss_chg = loss_chg;
                 s.leaf_child_cnt = 0;
+                s.sum_hess = static_cast<float>( sum_hess );
                 s.base_weight = static_cast<float>( base_weight );
 
                 // add childs to current node
@@ -345,7 +347,7 @@ namespace xgboost{
                     // add splits
                     tree[ tsk.nid ].set_split( e.split_index(), e.split_value, e.default_left() );
                     // re-arrange idset, push tasks
-                    this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, base_weight );
+                    this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, rsum_hess, base_weight );
                 }else{
                     // make leaf if we didn't meet requirement
                     this->make_leaf( tsk, rsum_grad, rsum_hess, false );
diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp
index 6bf79c4c6..5daf759c7 100644
--- a/booster/tree/xgboost_tree.hpp
+++ b/booster/tree/xgboost_tree.hpp
@@ -105,8 +105,8 @@ namespace xgboost{
                 int pid = this->GetLeafIndex( feat, funknown, gid );
                 return tree[ pid ].leaf_value();
             }
-            virtual void DumpModel( FILE *fo ){
-                tree.DumpModel( fo );
+            virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
+                tree.DumpModel( fo, fmap, with_stats );
             }
         private:
             template
@@ -171,9 +171,8 @@ namespace xgboost{
             inline int GetNext( int pid, float fvalue, bool is_unknown ){
                 float split_value = tree[ pid ].split_cond();
-                if( is_unknown ){
-                    if( tree[ pid ].default_left() ) return tree[ pid ].cleft();
-                    else return tree[ pid ].cright();
+                if( is_unknown ){
+                    return tree[ pid ].cdefault();
                 }else{
                     if( fvalue < split_value ) return tree[ pid ].cleft();
                     else return tree[ pid ].cright();
diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h
index 712d46b36..262616461 100644
--- a/booster/tree/xgboost_tree_model.h
+++ b/booster/tree/xgboost_tree_model.h
@@ -89,6 +89,10 @@ namespace xgboost{
             inline int cright( void ) const{
                 return this->cright_;
             }
+            /*! \brief index of default child when feature is missing */
+            inline int cdefault( void ) const{
+                return this->default_left() ? this->cleft() : this->cright();
+            }
             /*! \brief feature index of split condition */
             inline unsigned split_index( void ) const{
                 return sindex_ & ( (1U<<31) - 1U );
             }
@@ -228,9 +232,10 @@ namespace xgboost{
              */
             inline void LoadModel( utils::IStream &fi ){
                 utils::Assert( fi.Read( &param, sizeof(Param) ) > 0, "TreeModel" );
-                nodes.resize( param.num_nodes );
+                nodes.resize( param.num_nodes ); stats.resize( param.num_nodes );
                 utils::Assert( fi.Read( &nodes[0], sizeof(Node) * nodes.size() ) > 0, "TreeModel::Node" );
-
+                utils::Assert( fi.Read( &stats[0], sizeof(NodeStat) * stats.size() ) > 0, "TreeModel::Node" );
+
                 deleted_nodes.resize( 0 );
                 for( int i = param.num_roots; i < param.num_nodes; i ++ ){
                     if( nodes[i].is_root() ) deleted_nodes.push_back( i );
@@ -243,8 +248,10 @@ namespace xgboost{
              */
             inline void SaveModel( utils::IStream &fo ) const{
                 utils::Assert( param.num_nodes == (int)nodes.size() );
+                utils::Assert( param.num_nodes == (int)stats.size() );
                 fo.Write( &param, sizeof(Param) );
                 fo.Write( &nodes[0], sizeof(Node) * nodes.size() );
+                fo.Write( &stats[0], sizeof(NodeStat) * nodes.size() );
             }
             /*!
              * \brief add child nodes to node
@@ -285,23 +292,50 @@ namespace xgboost{
                 return param.num_nodes - param.num_roots - param.num_deleted;
             }
             /*! \brief dump model to text file */
-            inline void DumpModel( FILE *fo ){
-                this->Dump( 0, fo, 0 );
+            inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
+                this->Dump( 0, fo, fmap, 0, with_stats );
             }
         private:
-            void Dump( int nid, FILE *fo, int depth ){
+            void Dump( int nid, FILE *fo, const utils::FeatMap& fmap, int depth, bool with_stats ){
                 for( int i = 0; i < depth; ++ i ){
                     fprintf( fo, "\t" );
                 }
                 if( nodes[ nid ].is_leaf() ){
-                    fprintf( fo, "%d:leaf=%f\n", nid, nodes[ nid ].leaf_value() );
+                    fprintf( fo, "%d:leaf=%f ", nid, nodes[ nid ].leaf_value() );
+                    if( with_stats ){
+                        stat( nid ).Print( fo, true );
+                    }
+                    fprintf( fo, "\n" );
                 }else{
                     // right then left,
                     TSplitCond cond = nodes[ nid ].split_cond();
-                    fprintf( fo, "%d:[f%u<%f] yes=%d,no=%d\n", nid,
-                             nodes[ nid ].split_index(), float(cond), nodes[ nid ].cleft(), nodes[ nid ].cright() );
-                    this->Dump( nodes[ nid ].cleft(), fo, depth+1 );
-                    this->Dump( nodes[ nid ].cright(), fo, depth+1 );
+                    const unsigned split_index = nodes[ nid ].split_index();
+
+                    if( split_index < fmap.size() ){
+                        if( fmap.type(split_index) == utils::FeatMap::kIndicator ){
+                            int nyes = nodes[ nid ].default_left()?nodes[nid].cright():nodes[nid].cleft();
+                            fprintf( fo, "%d:[%s] yes=%d,no=%d",
+                                     nid, fmap.name( split_index ),
+                                     nyes, nodes[nid].cdefault() );
+                        }else{
+                            fprintf( fo, "%d:[%s<%f] yes=%d,no=%d,missing=%d",
+                                     nid, fmap.name(split_index), float(cond),
+                                     nodes[ nid ].cleft(), nodes[ nid ].cright(),
+                                     nodes[ nid ].cdefault() );
+                        }
+                    }else{
+                        fprintf( fo, "%d:[f%u<%f] yes=%d,no=%d,missing=%d",
+                                 nid, split_index, float(cond),
+                                 nodes[ nid ].cleft(), nodes[ nid ].cright(),
+                                 nodes[ nid ].cdefault() );
+                    }
+                    if( with_stats ){
+                        fprintf( fo, " ");
+                        stat( nid ).Print( fo, false );
+                    }
+                    fprintf( fo, "\n" );
+                    this->Dump( nodes[ nid ].cleft(), fo, fmap, depth+1, with_stats );
+                    this->Dump( nodes[ nid ].cright(), fo, fmap, depth+1, with_stats );
                 }
             }
         };
@@ -447,12 +481,22 @@ namespace xgboost{
     namespace booster{
         /*! \brief node statistics used in regression tree */
         struct RTreeNodeStat{
-            // loss chg caused by current split
+            /*! \brief loss chg caused by current split */
             float loss_chg;
-            // weight of current node
+            /*! \brief sum of hessian values, used to measure coverage of data */
+            float sum_hess;
+            /*! \brief weight of current node */
             float base_weight;
-            // number of child that is leaf node known up to now
+            /*! \brief number of child that is leaf node known up to now */
            int leaf_child_cnt;
+            /*! \brief print information of current stats to fo */
+            inline void Print( FILE *fo, bool is_leaf ) const{
+                if( !is_leaf ){
+                    fprintf( fo, "gain=%f,cover=%f", loss_chg, sum_hess );
+                }else{
+                    fprintf( fo, "cover=%f", sum_hess );
+                }
+            }
         };
         /*! \brief most comment structure of regression tree */
         class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
diff --git a/booster/xgboost.h b/booster/xgboost.h
index f688218f9..800fa13e0 100644
--- a/booster/xgboost.h
+++ b/booster/xgboost.h
@@ -8,6 +8,7 @@
  */
 #include
 #include "../utils/xgboost_utils.h"
+#include "../utils/xgboost_fmap.h"
 #include "../utils/xgboost_stream.h"
 #include "../utils/xgboost_config.h"
 #include "xgboost_data.h"
@@ -107,8 +108,10 @@ namespace xgboost{
             /*!
              * \brief dump model into text file
              * \param fo output stream
-             */
-            virtual void DumpModel( FILE *fo ){
+             * \param fmap feature map that may help give interpretations of feature
+             * \param with_stats whether print statistics
+             */
+            virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){
                 utils::Error( "not implemented" );
             }
         public:
diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h
index e6545dd40..841e62a69 100644
--- a/booster/xgboost_gbmbase.h
+++ b/booster/xgboost_gbmbase.h
@@ -188,11 +188,13 @@ namespace xgboost{
         /*!
          * \brief DumpModel
          * \param fo text file
+         * \param fmap feature map that may help give interpretations of feature
+         * \param with_stats whether print statistics
          */
-        inline void DumpModel( FILE *fo ){
+        inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
             for( size_t i = 0; i < boosters.size(); i ++ ){
                 fprintf( fo, "booster[%d]\n", (int)i );
-                boosters[i]->DumpModel( fo );
+                boosters[i]->DumpModel( fo, fmap, with_stats );
             }
         }
         /*!
diff --git a/demo/mushroom/mapfeat.py b/demo/mushroom/mapfeat.py
index 5056368f7..74ca22d32 100755
--- a/demo/mushroom/mapfeat.py
+++ b/demo/mushroom/mapfeat.py
@@ -25,11 +25,11 @@ def loadfmap( fname ):
 def write_nmap( fo, nmap ):
     for i in xrange( len(nmap) ):
-        fo.write('%d\t%s\n' % (i, nmap[i]) )
+        fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
 
 # start here
 fmap, nmap = loadfmap( 'agaricus-lepiota.fmap' )
-fo = open( 'featname.txt', 'w' )
+fo = open( 'featmap.txt', 'w' )
 write_nmap( fo, nmap )
 fo.close()
diff --git a/demo/mushroom/runexp.sh b/demo/mushroom/runexp.sh
index f4ae9c5e2..50d60ca9f 100755
--- a/demo/mushroom/runexp.sh
+++ b/demo/mushroom/runexp.sh
@@ -2,5 +2,9 @@
 python mapfeat.py
 python mknfold.py agaricus.txt 1
 ../../xgboost mushroom.conf
-../../xgboost mushroom.conf task=dump model_in=0003.model
-python maptree.py
+# this is what the dump will look like without a feature map
+../../xgboost mushroom.conf task=dump model_in=0003.model name_dump=dump.raw.txt
+# this is what the dump will look like with a feature map
+../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
+
diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h
index db1395168..4c2debe2e 100644
--- a/regression/xgboost_reg.h
+++ b/regression/xgboost_reg.h
@@ -116,9 +116,11 @@ namespace xgboost{
             /*!
              * \brief DumpModel
              * \param fo text file
+             * \param fmap feature map that may help give interpretations of feature
+             * \param with_stats whether print statistics as well
              */
-            inline void DumpModel( FILE *fo ){
-                base_model.DumpModel( fo );
+            inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
+                base_model.DumpModel( fo, fmap, with_stats );
             }
             /*!
              * \brief Dump path of all trees
diff --git a/regression/xgboost_reg_main.cpp b/regression/xgboost_reg_main.cpp
index 656980acb..3b604f30e 100644
--- a/regression/xgboost_reg_main.cpp
+++ b/regression/xgboost_reg_main.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include "xgboost_reg.h"
+#include "../utils/xgboost_fmap.h"
 #include "../utils/xgboost_random.h"
 #include "../utils/xgboost_config.h"
@@ -60,6 +61,10 @@ namespace xgboost{
                 if( !strcmp("test:data", name ) )   test_path = val;
                 if( !strcmp("model_in", name ) )    model_in = val;
                 if( !strcmp("model_dir", name ) )   model_dir_path = val;
+                if( !strcmp("fmap", name ) )        name_fmap = val;
+                if( !strcmp("name_dump", name ) )   name_dump = val;
+                if( !strcmp("name_pred", name ) )   name_pred = val;
+                if( !strcmp("dump_stats", name ) )  dump_model_stats = atoi( val );
                 if( !strncmp("eval[", name, 5 ) ) {
                     char evname[ 256 ];
                     utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
@@ -75,8 +80,10 @@ namespace xgboost{
                 use_buffer = 1;
                 num_round = 10;
                 save_period = 0;
+                dump_model_stats = 0;
                 task = "train";
                 model_in = "NULL";
+                name_fmap = "NULL";
                 name_pred = "pred.txt";
                 name_dump = "dump.txt";
                 name_dumppath = "dump.path.txt";
@@ -89,7 +96,8 @@ namespace xgboost{
             }
         private:
             inline void InitData( void ){
-                if( task == "dump") return;
+                if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
+                if( task == "dump" ) return;
                 if( task == "test" || task == "dumppath" ){
                     data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
                 }else{
@@ -142,7 +150,7 @@ namespace xgboost{
             inline void TaskDump( void ){
                 FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
-                learner.DumpModel( fo, fmap, dump_model_stats != 0 );
+                learner.DumpModel( fo, fmap, dump_model_stats != 0 );
                 fclose( fo );
             }
             inline void TaskDumpPath( void ){
@@ -187,6 +195,10 @@ namespace xgboost{
             std::string task;
             /* \brief name of predict file */
             std::string name_pred;
+            /* \brief whether dump statistics along with model */
+            int dump_model_stats;
+            /* \brief name of feature map */
+            std::string name_fmap;
             /* \brief name of dump file */
             std::string name_dump;
             /* \brief name of dump path file */
@@ -194,12 +206,13 @@ namespace xgboost{
             /* \brief the paths of validation data sets */
             std::vector<std::string> eval_data_paths;
             /* \brief the names of the evaluation data used in output log */
-            std::vector<std::string> eval_data_names;
+            std::vector<std::string> eval_data_names;
             /*! \brief saves configurations */
             utils::ConfigSaver cfg;
         private:
             DMatrix data;
             std::vector<DMatrix> deval;
+            utils::FeatMap fmap;
             RegBoostLearner learner;
         };
     };
diff --git a/utils/xgboost_fmap.h b/utils/xgboost_fmap.h
new file mode 100644
index 000000000..dd7add417
--- /dev/null
+++ b/utils/xgboost_fmap.h
@@ -0,0 +1,68 @@
+#ifndef XGBOOST_FMAP_H
+#define XGBOOST_FMAP_H
+/*!
+ * \file xgboost_fmap.h
+ * \brief helper class that holds the feature names and interpretations
+ * \author Tianqi Chen: tianqi.tchen@gmail.com
+ */
+#include <vector>
+#include <string>
+#include <cstring>
+#include "xgboost_utils.h"
+
+namespace xgboost{
+    namespace utils{
+        /*! \brief helper class that holds the feature names and interpretations */
+        class FeatMap{
+        public:
+            enum Type{
+                kIndicator = 0,
+                kQuantitive = 1
+            };
+        public:
+            /*! \brief load feature map from text format */
+            inline void LoadText( const char *fname ){
+                FILE *fi = utils::FopenCheck( fname, "r" );
+                this->LoadText( fi );
+                fclose( fi );
+            }
+            /*! \brief load feature map from text format */
+            inline void LoadText( FILE *fi ){
+                int fid;
+                char fname[256], ftype[256];
+                while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){
+                    utils::Assert( fid == (int)names_.size(), "invalid fmap format" );
+                    names_.push_back( std::string(fname) );
+                    types_.push_back( GetType( ftype ) );
+                }
+            }
+            /*! \brief number of known features */
+            size_t size( void ) const{
+                return names_.size();
+            }
+            /*! \brief return name of specific feature */
+            const char* name( size_t idx ) const{
+                utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
+                return names_[ idx ].c_str();
+            }
+            /*! \brief return type of specific feature */
+            const Type& type( size_t idx ) const{
+                utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
+                return types_[ idx ];
+            }
+        private:
+            inline static Type GetType( const char *tname ){
+                if( !strcmp( "i", tname ) ) return kIndicator;
+                if( !strcmp( "q", tname ) ) return kQuantitive;
+                utils::Error("unknown feature type, use i for indicator and q for quantity");
+                return kIndicator;
+            }
+        private:
+            /*! \brief name of the feature */
+            std::vector<std::string> names_;
+            /*! \brief type of the feature */
+            std::vector<Type> types_;
+        };
+    }; // namespace utils
+}; // namespace xgboost
+#endif // XGBOOST_FMAP_H
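
The feature map consumed by FeatMap::LoadText is a plain text file with one feature per line in the form "<id> <name> <type>": ids must start at 0 and be consecutive, and the type is "i" for an indicator (0/1) feature or "q" for a quantitative one. This is exactly the tab-separated "%d\t%s\ti" format that demo/mushroom/mapfeat.py now writes into featmap.txt. As an illustration only (the feature names below are hypothetical, not taken from the real mushroom feature map), a featmap.txt could look like:

    0    cap-shape=bell      i
    1    cap-shape=conical   i
    2    stalk-width         q

Running the dump task with a feature map and statistics enabled, e.g. ../../xgboost mushroom.conf task=dump model_in=0003.model fmap=featmap.txt dump_stats=1 name_dump=dump.nice.txt, makes TreeModel::Dump print named splits plus per-node gain/cover instead of raw f<index> splits. The numbers below are likewise made up and only show the shape of the output (children are indented by one tab per tree depth):

    booster[0]
    0:[cap-shape=bell] yes=2,no=1 gain=4000.530762,cover=1628.250000
        1:[stalk-width<1.600000] yes=3,no=4,missing=3 gain=1158.212891,cover=924.500000
            3:leaf=1.714783 cover=812.000000
            4:leaf=-1.700444 cover=112.500000
        2:leaf=-1.871345 cover=703.750000

For an indicator split the "yes" branch is the non-default child and a missing value follows the "no" (default) branch, so no separate missing= field is printed; quantitative and unmapped features keep the explicit missing= field, and without dump_stats=1 the gain=/cover= part is omitted.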
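
Since the new header is self-contained, a feature map can also be sanity-checked outside of the dump task. The following is a minimal sketch, not part of this commit: the file name fmap_check.cpp and its usage string are made up, it assumes compilation from the repository root (so that utils/xgboost_fmap.h resolves), and it relies only on the FeatMap interface introduced above (LoadText, size, name, type):

    // fmap_check.cpp (hypothetical): echo a feature map back in the
    // "<id> <name> <type>" form that FeatMap::LoadText expects.
    #include <cstdio>
    #include "utils/xgboost_fmap.h"

    int main( int argc, char *argv[] ){
        if( argc < 2 ){
            printf( "usage: fmap_check <featmap.txt>\n" );
            return 0;
        }
        xgboost::utils::FeatMap fmap;
        // LoadText asserts that feature ids are consecutive and start at 0
        fmap.LoadText( argv[1] );
        for( size_t i = 0; i < fmap.size(); ++ i ){
            // anything that is not an indicator is quantitative ("q")
            const char *type = ( fmap.type( i ) == xgboost::utils::FeatMap::kIndicator ) ? "i" : "q";
            printf( "%u\t%s\t%s\n", (unsigned)i, fmap.name( i ), type );
        }
        return 0;
    }

A featmap.txt written by demo/mushroom/mapfeat.py should come back unchanged; a malformed file trips the "invalid fmap format" assertion here instead of producing a confusing dump later.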