diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp index 76ec073ce..783e7c380 100644 --- a/booster/tree/xgboost_tree.hpp +++ b/booster/tree/xgboost_tree.hpp @@ -57,7 +57,7 @@ namespace xgboost{ virtual int GetLeafIndex( const std::vector &feat, const std::vector &funknown, - unsigned gid = 0 ){ + unsigned gid = 0 ){ // start from groups that belong to current data int pid = (int)gid; // tranverse tree @@ -67,18 +67,28 @@ namespace xgboost{ } return pid; } + + virtual void PredPath( std::vector &path, const FMatrixS::Line &feat, unsigned gid = 0 ){ + path.clear(); + this->InitTmp(); + this->PrepareTmp( feat ); + + int pid = (int)gid; + path.push_back( pid ); + // traverse tree + while( !tree[ pid ].is_leaf() ){ + unsigned split_index = tree[ pid ].split_index(); + pid = this->GetNext( pid, tmp_feat[ split_index ], tmp_funknown[ split_index ] ); + path.push_back( pid ); + } + this->DropTmp( feat ); + } + virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){ this->InitTmp(); - for( unsigned i = 0; i < feat.len; i ++ ){ - utils::Assert( feat[i].findex < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); - tmp_funknown[ feat[i].findex ] = false; - tmp_feat[ feat[i].findex ] = feat[i].fvalue; - } + this->PrepareTmp( feat ); int pid = this->GetLeafIndex( tmp_feat, tmp_funknown, gid ); - // set back - for( unsigned i = 0; i < feat.len; i ++ ){ - tmp_funknown[ feat[i].findex ] = true; - } + this->DropTmp( feat ); return tree[ pid ].leaf_value(); } virtual float Predict( const std::vector &feat, @@ -127,6 +137,18 @@ namespace xgboost{ std::fill( tmp_funknown.begin(), tmp_funknown.end(), true ); } } + inline void PrepareTmp( const FMatrixS::Line &feat ){ + for( unsigned i = 0; i < feat.len; i ++ ){ + utils::Assert( feat[i].findex < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); + tmp_funknown[ feat[i].findex ] = false; + tmp_feat[ feat[i].findex ] = feat[i].fvalue; + } + } + inline void DropTmp( const 
FMatrixS::Line &feat ){ + for( unsigned i = 0; i < feat.len; i ++ ){ + tmp_funknown[ feat[i].findex ] = true; + } + } inline int GetNext( int pid, float fvalue, bool is_unknown ){ float split_value = tree[ pid ].split_cond(); diff --git a/booster/xgboost.h b/booster/xgboost.h index 06fb644f8..731b869bb 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -65,6 +65,14 @@ namespace xgboost{ std::vector &hess, const FMatrixS &feats, const std::vector &root_index ) = 0; + /*! + * \brief predict the path ids along the trees, for the given sparse feature vector; only valid when the booster is a tree + * \param path the resulting path of node ids + * \param rid root id of current instance, default = 0 + */ + virtual void PredPath( std::vector &path, const FMatrixS::Line &feat, unsigned rid = 0 ){ + utils::Error( "not implemented" ); + } /*! * \brief predict values for given sparse feature vector * NOTE: in tree implementation, this is not threadsafe, used dense version to ensure threadsafety diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h index 7759de35a..a846061dd 100644 --- a/booster/xgboost_gbmbase.h +++ b/booster/xgboost_gbmbase.h @@ -179,6 +179,25 @@ namespace xgboost{ boosters[i]->DumpModel( fo ); } } + /*! + * \brief Dump path of all trees + * \param fo text file + * \param data input data + */ + inline void DumpPath( FILE *fo, const FMatrixS &data ){ + for( size_t i = 0; i < data.NumRow(); ++ i ){ + for( size_t j = 0; j < boosters.size(); ++ j ){ + if( j != 0 ) fprintf( fo, "\t" ); + std::vector path; + boosters[j]->PredPath( path, data[i] ); + fprintf( fo, "%d", path[0] ); + for( size_t k = 1; k < path.size(); ++ k ){ + fprintf( fo, ",%d", path[k] ); + } + } + fprintf( fo, "\n" ); + } + } public: /*! * \brief do gradient boost training for one step, using the information given @@ -195,7 +214,7 @@ namespace xgboost{ const std::vector &root_index ) { booster::IBooster *bst = this->GetUpdateBooster(); bst->DoBoost( grad, hess, feats, root_index ); - } + } /*! 
* \brief predict values for given sparse feature vector * NOTE: in tree implementation, this is not threadsafe @@ -204,7 +223,7 @@ namespace xgboost{ * \param rid root id of current instance, default = 0 * \return prediction */ - virtual float Predict( const booster::FMatrixS::Line &feat, int buffer_index = -1, unsigned rid = 0 ){ + virtual float Predict( const booster::FMatrixS::Line &feat, int buffer_index = -1, unsigned rid = 0 ){ size_t istart = 0; float psum = 0.0f; diff --git a/demo/mushroom/mushroom.conf b/demo/mushroom/mushroom.conf index e0c2b2992..e3d3f484e 100644 --- a/demo/mushroom/mushroom.conf +++ b/demo/mushroom/mushroom.conf @@ -4,6 +4,8 @@ save_period=0 data = "agaricus.txt.train" eval[test] = "agaricus.txt.test" +test:data = "agaricus.txt.test" + booster_type = 0 loss_type = 2 diff --git a/demo/mushroom/runexp.sh b/demo/mushroom/runexp.sh index f4ae9c5e2..16747b15e 100755 --- a/demo/mushroom/runexp.sh +++ b/demo/mushroom/runexp.sh @@ -3,4 +3,5 @@ python mapfeat.py python mknfold.py agaricus.txt 1 ../../xgboost mushroom.conf ../../xgboost mushroom.conf task=dump model_in=0003.model +../../xgboost mushroom.conf task=dumppath model_in=0003.model python maptree.py diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h index a55226798..542185a77 100644 --- a/regression/xgboost_reg.h +++ b/regression/xgboost_reg.h @@ -99,6 +99,14 @@ namespace xgboost{ inline void DumpModel( FILE *fo ){ base_model.DumpModel( fo ); } + /*! + * \brief Dump path of all trees + * \param fo text file + * \param data input data + */ + inline void DumpPath( FILE *fo, const DMatrix &data ){ + base_model.DumpPath( fo, data.data ); + } /*! 
* \brief save model to stream * \param fo output stream diff --git a/regression/xgboost_reg_main.cpp b/regression/xgboost_reg_main.cpp index 537801d60..61751b961 100644 --- a/regression/xgboost_reg_main.cpp +++ b/regression/xgboost_reg_main.cpp @@ -34,13 +34,17 @@ namespace xgboost{ } this->InitData(); this->InitLearner(); - if( !strcmp( task.c_str(), "dump") ){ + if( task == "dump" ){ this->TaskDump(); return 0; } - if( !strcmp( task.c_str(), "test") ){ + if( task == "dumppath" ){ + this->TaskDumpPath(); + return 0; + } + if( task == "test" ){ this->TaskTest(); - }else{ + }else{ this->TaskTrain(); } return 0; @@ -73,6 +77,7 @@ namespace xgboost{ model_in = "NULL"; name_pred = "pred.txt"; name_dump = "dump.txt"; + name_dumppath = "dump.path.txt"; model_dir_path = "./"; } ~RegBoostTask( void ){ @@ -82,8 +87,8 @@ namespace xgboost{ } private: inline void InitData( void ){ - if( !strcmp( task.c_str(), "dump") ) return; - if( !strcmp( task.c_str(), "test") ){ + if( task == "dump") return; + if( task == "test" || task == "dumppath" ){ data.CacheLoad( test_path.c_str() ); }else{ // training @@ -101,12 +106,12 @@ namespace xgboost{ while( cfg.Next() ){ learner.SetParam( cfg.name(), cfg.val() ); } - if( strcmp( model_in.c_str(), "NULL" ) != 0 ){ + if( model_in != "NULL" ){ utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") ); learner.LoadModel( fi ); fi.Close(); }else{ - utils::Assert( !strcmp( task.c_str(), "train"), "model_in not specified" ); + utils::Assert( task == "train", "model_in not specified" ); learner.InitModel(); } learner.InitTrainer(); @@ -138,6 +143,11 @@ namespace xgboost{ learner.DumpModel( fo ); fclose( fo ); } + inline void TaskDumpPath( void ){ + FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" ); + learner.DumpPath( fo, data ); + fclose( fo ); + } inline void SaveModel( int i ) const{ char fname[256]; sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 ); @@ -175,6 +185,8 @@ namespace xgboost{ std::string name_pred; /* 
\brief name of dump file */ std::string name_dump; + /* \brief name of dump path file */ + std::string name_dumppath; /* \brief the paths of validation data sets */ std::vector eval_data_paths; /* \brief the names of the evaluation data used in output log */