diff --git a/Makefile b/Makefile
index b55cb3ac4..ef37a002d 100644
--- a/Makefile
+++ b/Makefile
@@ -4,14 +4,13 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
 
 # specify tensor path
 BIN = xgboost
-OBJ = xgboost.o
+OBJ =
 .PHONY: clean all
 
 all: $(BIN) $(OBJ)
 export LDFLAGS= -pthread -lm
 
-xgboost.o: booster/xgboost.h booster/xgboost_data.h booster/xgboost.cpp booster/*/*.hpp booster/*/*.h
-xgboost: regression/xgboost_reg_main.cpp regression/*.h xgboost.o
+xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
 
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/booster/linear/xgboost_linear.hpp b/booster/linear/xgboost_linear.hpp
index eb76cf8d7..8979bee72 100644
--- a/booster/linear/xgboost_linear.hpp
+++ b/booster/linear/xgboost_linear.hpp
@@ -15,7 +15,8 @@ namespace xgboost{
     namespace booster{
         /*! \brief linear model, with L1/L2 regularization */
-        class LinearBooster : public IBooster{
+        template<typename FMatrix>
+        class LinearBooster : public InterfaceBooster<FMatrix>{
         public:
             LinearBooster( void ){ silent = 0;}
             virtual ~LinearBooster( void ){}
@@ -37,15 +38,15 @@ namespace xgboost{
         public:
             virtual void DoBoost( std::vector<float> &grad,
                                   std::vector<float> &hess,
-                                  const FMatrixS &smat,
+                                  const FMatrix &fmat,
                                   const std::vector<unsigned> &root_index ){
                 utils::Assert( grad.size() < UINT_MAX, "number of instances exceeds what we can handle" );
-                this->UpdateWeights( grad, hess, smat );
-            }
-            virtual float Predict( const FMatrixS::Line &sp, unsigned rid = 0 ){
+                this->UpdateWeights( grad, hess, fmat );
+            }
+            inline float Predict( const FMatrix &fmat, bst_uint ridx, unsigned root_index ){
                 float sum = model.bias();
-                for( unsigned i = 0; i < sp.len; i ++ ){
-                    sum += model.weight[ sp[i].findex ] * sp[i].fvalue;
+                for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
+                    sum += model.weight[ it.findex() ] * it.fvalue();
                 }
                 return sum;
             }
@@ -59,6 +60,7 @@ namespace xgboost{
                 }
                 return sum;
             }
+
         protected:
             // training parameter
             struct ParamTrain{
@@ -155,7 +157,6 @@ namespace xgboost{
             ParamTrain param;
         protected:
             // update weights, should work for any FMatrix
-            template<typename FMatrix>
             inline void UpdateWeights( std::vector<float> &grad,
                                        const std::vector<float> &hess,
                                        const FMatrix &smat ){
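
The key contract in this refactor is the FMatrix "concept": any type offering GetRow(ridx) that returns a RowIter with Next()/findex()/fvalue() can back a booster. Below is a minimal self-contained sketch of that contract and of the shape the new LinearBooster::Predict takes; ToyFMatrix and LinearPredict are illustrative names, not part of the patch.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    typedef unsigned bst_uint;

    // toy CSR matrix exposing the assumed iterator contract
    struct ToyFMatrix{
        std::vector<size_t>   row_ptr;   // nrow+1 offsets into findex/fvalue
        std::vector<bst_uint> findex;
        std::vector<float>    fvalue;

        struct RowIter{
            const ToyFMatrix *p;
            size_t at, end;              // 'at' sits one before the next entry
            inline bool Next( void ){ return ++at < end; }
            inline bst_uint findex( void ) const{ return p->findex[at]; }
            inline float    fvalue( void ) const{ return p->fvalue[at]; }
        };
        inline RowIter GetRow( bst_uint ridx ) const{
            RowIter it; it.p = this;
            it.at  = row_ptr[ridx] - 1;  // unsigned wrap is fine: Next() pre-increments
            it.end = row_ptr[ridx + 1];
            return it;
        }
    };

    // the shape of the new LinearBooster::Predict, written against that contract
    template<typename FMatrix>
    float LinearPredict( const FMatrix &fmat, bst_uint ridx,
                         const std::vector<float> &weight, float bias ){
        float sum = bias;
        for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
            sum += weight[ it.findex() ] * it.fvalue();
        }
        return sum;
    }

    int main( void ){
        ToyFMatrix m;                    // one row: {f0:1, f2:3}
        m.row_ptr.push_back(0); m.row_ptr.push_back(2);
        m.findex.push_back(0);  m.findex.push_back(2);
        m.fvalue.push_back(1.0f); m.fvalue.push_back(3.0f);
        std::vector<float> w( 3, 0.5f );
        printf( "%g\n", LinearPredict( m, 0, w, 0.1f ) );  // 0.1 + 0.5*1 + 0.5*3 = 2.1
        return 0;
    }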
diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp
index 5daf759c7..31cfcf01f 100644
--- a/booster/tree/xgboost_tree.hpp
+++ b/booster/tree/xgboost_tree.hpp
@@ -29,7 +29,8 @@ namespace xgboost{
     namespace booster{
         // regression tree, construction algorithm is separated from this class
         // see RegTreeUpdater
-        class RegTreeTrainer : public IBooster{
+        template<typename FMatrix>
+        class RegTreeTrainer : public InterfaceBooster<FMatrix>{
         public:
             RegTreeTrainer( void ){
                 silent = 0; tree_maker = 1;
@@ -55,62 +56,6 @@ namespace xgboost{
             }
         public:
             virtual void DoBoost( std::vector<float> &grad,
-                                  std::vector<float> &hess,
-                                  const FMatrixS &smat,
-                                  const std::vector<unsigned> &root_index ){
-                this->DoBoost_( grad, hess, smat, root_index );
-            }
-
-            virtual int GetLeafIndex( const std::vector<float> &feat,
-                                      const std::vector<bool> &funknown,
-                                      unsigned gid = 0 ){
-                // start from the group that belongs to the current data
-                int pid = (int)gid;
-                // traverse tree
-                while( !tree[ pid ].is_leaf() ){
-                    unsigned split_index = tree[ pid ].split_index();
-                    pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
-                }
-                return pid;
-            }
-
-            virtual void PredPath( std::vector<int> &path, const FMatrixS::Line &feat, unsigned gid = 0 ){
-                path.clear();
-                ThreadEntry &e = this->InitTmp();
-                this->PrepareTmp( feat, e );
-
-                int pid = (int)gid;
-                path.push_back( pid );
-                // traverse tree
-                while( !tree[ pid ].is_leaf() ){
-                    unsigned split_index = tree[ pid ].split_index();
-                    pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
-                    path.push_back( pid );
-                }
-                this->DropTmp( feat, e );
-            }
-            // make it OpenMP thread-safe, but not thread-safe in general
-            virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){
-                ThreadEntry &e = this->InitTmp();
-                this->PrepareTmp( feat, e );
-                int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
-                this->DropTmp( feat, e );
-                return tree[ pid ].leaf_value();
-            }
-            virtual float Predict( const std::vector<float> &feat,
-                                   const std::vector<bool> &funknown,
-                                   unsigned gid = 0 ){
-                utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
-                               "input data smaller than num feature" );
-                int pid = this->GetLeafIndex( feat, funknown, gid );
-                return tree[ pid ].leaf_value();
-            }
-            virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
-                tree.DumpModel( fo, fmap, with_stats );
-            }
-        private:
-            template<typename FMatrix>
-            inline void DoBoost_( std::vector<float> &grad,
                                   std::vector<float> &hess,
                                   const FMatrix &smat,
                                   const std::vector<unsigned> &root_index ){
@@ -131,6 +76,52 @@ namespace xgboost{
                     printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
                             tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
                 }
+            }
+            virtual float Predict( const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
+                ThreadEntry &e = this->InitTmp();
+                this->PrepareTmp( fmat.GetRow(ridx), e );
+                int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
+                this->DropTmp( fmat.GetRow(ridx), e );
+                return tree[ pid ].leaf_value();
+            }
+            virtual int GetLeafIndex( const std::vector<float> &feat,
+                                      const std::vector<bool> &funknown,
+                                      unsigned gid = 0 ){
+                // start from the group that belongs to the current data
+                int pid = (int)gid;
+                // traverse tree
+                while( !tree[ pid ].is_leaf() ){
+                    unsigned split_index = tree[ pid ].split_index();
+                    pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
+                }
+                return pid;
+            }
+
+            virtual void PredPath( std::vector<int> &path, const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
+                path.clear();
+                ThreadEntry &e = this->InitTmp();
+                this->PrepareTmp( fmat.GetRow(ridx), e );
+
+                int pid = (int)gid;
+                path.push_back( pid );
+                // traverse tree
+                while( !tree[ pid ].is_leaf() ){
+                    unsigned split_index = tree[ pid ].split_index();
+                    pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
+                    path.push_back( pid );
+                }
+                this->DropTmp( fmat.GetRow(ridx), e );
+            }
+            virtual float Predict( const std::vector<float> &feat,
+                                   const std::vector<bool> &funknown,
+                                   unsigned gid = 0 ){
+                utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
+                               "input data smaller than num feature" );
+                int pid = this->GetLeafIndex( feat, funknown, gid );
+                return tree[ pid ].leaf_value();
+            }
+            virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
+                tree.DumpModel( fo, fmap, with_stats );
             }
         private:
             int silent;
@@ -144,7 +135,6 @@ namespace xgboost{
             };
             std::vector<ThreadEntry> threadtemp;
         private:
-
             inline ThreadEntry& InitTmp( void ){
                 const int tid = omp_get_thread_num();
                 utils::Assert( tid < (int)threadtemp.size(), "RTreeUpdater: threadtemp pool is too small" );
@@ -156,16 +146,17 @@ namespace xgboost{
                 }
                 return e;
             }
-            inline void PrepareTmp( const FMatrixS::Line &feat, ThreadEntry &e ){
-                for( unsigned i = 0; i < feat.len; i ++ ){
-                    utils::Assert( feat[i].findex < (unsigned)tree.param.num_feature, "input feature exceeds bound" );
-                    e.funknown[ feat[i].findex ] = false;
-                    e.feat[ feat[i].findex ] = feat[i].fvalue;
+            inline void PrepareTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
+                while( it.Next() ){
+                    const bst_uint findex = it.findex();
+                    utils::Assert( findex < (unsigned)tree.param.num_feature, "input feature exceeds bound" );
+                    e.funknown[ findex ] = false;
+                    e.feat[ findex ] = it.fvalue();
                 }
             }
-            inline void DropTmp( const FMatrixS::Line &feat, ThreadEntry &e ){
-                for( unsigned i = 0; i < feat.len; i ++ ){
-                    e.funknown[ feat[i].findex ] = true;
+            inline void DropTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
+                while( it.Next() ){
+                    e.funknown[ it.findex() ] = true;
                 }
             }
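
Worth calling out why tree prediction is OpenMP-safe but not thread-safe in general: each OpenMP thread scatters the sparse row into its own dense ThreadEntry (InitTmp/PrepareTmp), and DropTmp resets exactly the entries that were written, so the buffer returns to the all-unknown state at a cost proportional to the row length rather than to num_feature. A self-contained sketch of that pattern, generic over any RowIter; the sizing of threadtemp to the thread count is assumed to happen at setup:

    #include <omp.h>
    #include <algorithm>
    #include <vector>

    struct ThreadEntry{
        std::vector<float> feat;      // dense feature values
        std::vector<bool>  funknown;  // true = feature missing
    };
    // one slot per OpenMP thread; assumed resized to omp_get_max_threads() at setup
    static std::vector<ThreadEntry> threadtemp;

    inline ThreadEntry& InitTmp( int num_feature ){
        ThreadEntry &e = threadtemp[ omp_get_thread_num() ];
        if( e.feat.size() != (size_t)num_feature ){
            e.feat.resize( num_feature );
            e.funknown.resize( num_feature );
            std::fill( e.funknown.begin(), e.funknown.end(), true );
        }
        return e;
    }
    // scatter the sparse row into the dense per-thread buffer
    template<typename RowIter>
    inline void PrepareTmp( RowIter it, ThreadEntry &e ){
        while( it.Next() ){
            e.funknown[ it.findex() ] = false;
            e.feat    [ it.findex() ] = it.fvalue();
        }
    }
    // undo only what PrepareTmp touched: O(row length) instead of O(num_feature)
    template<typename RowIter>
    inline void DropTmp( RowIter it, ThreadEntry &e ){
        while( it.Next() ){
            e.funknown[ it.findex() ] = true;
        }
    }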
diff --git a/booster/xgboost.cpp b/booster/xgboost-inl.hpp
similarity index 53%
rename from booster/xgboost.cpp
rename to booster/xgboost-inl.hpp
index 61f42004b..eee64ac7b 100644
--- a/booster/xgboost.cpp
+++ b/booster/xgboost-inl.hpp
@@ -1,16 +1,13 @@
+#ifndef XGBOOST_INL_HPP
+#define XGBOOST_INL_HPP
 /*!
-* \file xgboost.cpp
-* \brief booster implementations
-* \author Tianqi Chen: tianqi.tchen@gmail.com
-*/
+ * \file xgboost-inl.hpp
+ * \brief booster implementations
+ * \author Tianqi Chen: tianqi.tchen@gmail.com
+ */
 // implementations of boosters go here
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#include <climits>
 #include "xgboost.h"
 #include "../utils/xgboost_utils.h"
-#include "xgboost_gbmbase.h"
-// implementations of boosters
 #include "tree/xgboost_tree.hpp"
 #include "linear/xgboost_linear.hpp"
 
@@ -19,16 +16,18 @@ namespace xgboost{
         /*!
          * \brief create a gradient booster, given the type of booster
          * \param booster_type type of gradient booster, used to select the implementation
+         * \tparam FMatrix input data type for the booster
         * \return the pointer to the gradient booster created
         */
-        IBooster *CreateBooster( int booster_type ){
+        template<typename FMatrix>
+        inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){
             switch( booster_type ){
-            case 0: return new RegTreeTrainer();
-            case 1: return new LinearBooster();
+            case 0: return new RegTreeTrainer<FMatrix>();
+            case 1: return new LinearBooster<FMatrix>();
             default: utils::Error("unknown booster_type"); return NULL;
             }
         }
     };
 };
+#endif // XGBOOST_INL_HPP
diff --git a/booster/xgboost.h b/booster/xgboost.h
index 800fa13e0..587cefdc3 100644
--- a/booster/xgboost.h
+++ b/booster/xgboost.h
@@ -3,7 +3,9 @@
 /*!
  * \file xgboost.h
  * \brief the general gradient boosting interface
- *
+ *
+ * common practice for this header: use IBooster and CreateBooster<FMatrixS>
+ *
  * \author Tianqi Chen: tianqi.tchen@gmail.com
  */
 #include <vector>
@@ -19,8 +21,10 @@ namespace xgboost{
     namespace booster{
         /*!
          * \brief interface of a gradient boosting learner
+         * \tparam FMatrix the feature matrix format that the booster takes
          */
-        class IBooster{
+        template<typename FMatrix>
+        class InterfaceBooster{
         public:
             // interface for model setting and loading
             // calling procedure:
@@ -69,9 +73,12 @@ namespace xgboost{
             /*!
              * \brief predict the path of node ids along a tree, for a given sparse feature vector; valid when the booster is a tree
              * \param path the resulting path
-             * \param rid root id of current instance, default = 0
+             * \param feats feature matrix
+             * \param row_index row index in the feature matrix
+             * \param root_index root id of current instance, default = 0
              */
-            virtual void PredPath( std::vector<int> &path, const FMatrixS::Line &feat, unsigned rid = 0 ){
+            virtual void PredPath( std::vector<int> &path, const FMatrix &feats,
+                                   bst_uint row_index, unsigned root_index = 0 ){
                 utils::Error( "not implemented" );
             }
             /*!
@@ -79,11 +86,12 @@ namespace xgboost{
              *
              * NOTE: in the tree implementation, the sparse Predict is OpenMP thread-safe, but not thread-safe in general;
              * the dense version of Predict ensures thread safety
-             * \param feat vector in sparse format
-             * \param rid root id of current instance, default = 0
+             * \param feats feature matrix
+             * \param row_index row index in the feature matrix
+             * \param root_index root id of current instance, default = 0
              * \return prediction
              */
-            virtual float Predict( const FMatrixS::Line &feat, unsigned rid = 0 ){
+            virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){
                 utils::Error( "not implemented" );
                 return 0.0f;
             }
@@ -92,12 +100,12 @@ namespace xgboost{
              * \param feat feature vector in dense format
              * \param funknown indicator that the feature is missing
              * \param rid root id of current instance, default = 0
-             * \return prediction
-             */
+             * \return prediction
+             */
             virtual float Predict( const std::vector<float> &feat,
                                    const std::vector<bool> &funknown,
                                    unsigned rid = 0 ){
-                utils::Error( "not implemented" );
+                utils::Error( "not implemented" );
                 return 0.0f;
             }
             /*!
@@ -116,8 +124,15 @@ namespace xgboost{
             }
         public:
             /*! \brief virtual destructor */
-            virtual ~IBooster( void ){}
-        };
+            virtual ~InterfaceBooster( void ){}
+        };
+    };
+    namespace booster{
+        /*!
+         * \brief the most commonly used booster interface;
+         *  we try to keep boosters invariant of the data structure, but in most cases FMatrixS is what we want
+         */
+        typedef InterfaceBooster<FMatrixS> IBooster;
     };
 };
 
@@ -125,10 +140,24 @@ namespace xgboost{
     namespace booster{
         /*!
          * \brief create a gradient booster, given the type of booster
+         * normally we use FMatrixS, by calling CreateBooster<FMatrixS>
         * \param booster_type type of gradient booster, used to select the implementation
+        * \tparam FMatrix input data type for the booster
         * \return the pointer to the gradient booster created
         */
-        IBooster *CreateBooster( int booster_type );
+        template<typename FMatrix>
+        inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type );
    };
};
+
+// A good design would expose a minimal interface and have users operate only on that interface.
+// I break this a bit by using templates, which lets the user 'see' the implementation;
+// the user should pretend that they can only use the interface, and we are all cool.
+// So far this is the only way I can think of to keep boosters invariant of the data structure
+// while keeping everything fast.
+
+// this file includes the template implementations of all boosters;
+// the cost of using templates is that the user can 'see' all the implementations, which is OK
+// ignore the implementations and focus on the interface :)
+#include "xgboost-inl.hpp"
 #endif
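
The net effect: xgboost.h stays the single entry point, now pulling in the header-only implementations at the end, while the old non-template API survives as typedef InterfaceBooster<FMatrixS> IBooster. Intended usage might look like the sketch below; MyFMatrix is hypothetical, and the InitModel call follows the header's "calling procedure" comment:

    #include "booster/xgboost.h"        // now pulls in xgboost-inl.hpp at the end
    #include "booster/xgboost_data.h"

    using namespace xgboost;

    void Example( void ){
        // the common case keeps its old spelling through the typedef:
        booster::IBooster *tree = booster::CreateBooster<booster::FMatrixS>( 0 );
        // a custom matrix type would only need GetRow(ridx) returning a RowIter:
        //   booster::InterfaceBooster<MyFMatrix> *b =
        //       booster::CreateBooster<MyFMatrix>( 0 );
        tree->InitModel();   // per the "calling procedure" comment in the header
        delete tree;
    }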
diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h
index 5bd95d48b..128544e55 100644
--- a/booster/xgboost_data.h
+++ b/booster/xgboost_data.h
@@ -341,7 +341,7 @@ namespace xgboost{
                 utils::Assert( fi.Read( &data[0], data.size() * sizeof(REntry) ), "Load FMatrixS" );
             }
         }
-    private:
+    protected:
         /*! \brief row pointer of CSR sparse storage */
         std::vector<size_t> row_ptr_;
         /*! \brief data in the row */
diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h
index 4bbd8a8a2..50746e59e 100644
--- a/booster/xgboost_gbmbase.h
+++ b/booster/xgboost_gbmbase.h
@@ -3,6 +3,7 @@
 #include <vector>
 #include "xgboost.h"
+#include "xgboost_data.h"
 #include "../utils/xgboost_omp.h"
 #include "../utils/xgboost_config.h"
 /*!
@@ -128,7 +129,7 @@ namespace xgboost{
             utils::Assert( fi.Read( &param, sizeof(Param) ) != 0 );
             boosters.resize( param.num_boosters );
             for( size_t i = 0; i < boosters.size(); i ++ ){
-                boosters[ i ] = booster::CreateBooster( param.booster_type );
+                boosters[ i ] = booster::CreateBooster<FMatrixS>( param.booster_type );
                 boosters[ i ]->LoadModel( fi );
             }
             {// load info
@@ -207,7 +208,7 @@ namespace xgboost{
             for( size_t j = 0; j < boosters.size(); ++ j ){
                 if( j != 0 ) fprintf( fo, "\t" );
                 std::vector<int> path;
-                boosters[j]->PredPath( path, data[i] );
+                boosters[j]->PredPath( path, data, i );
                 fprintf( fo, "%d", path[0] );
                 for( size_t k = 1; k < path.size(); ++ k ){
                     fprintf( fo, ",%d", path[k] );
@@ -236,12 +237,13 @@ namespace xgboost{
         /*!
          * \brief predict values for a given sparse feature vector
          *   NOTE: in the tree implementation, this is not thread-safe
-         * \param feat vector in sparse format
+         * \param feats feature matrix
+         * \param row_index row index in the feature matrix
          * \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
-         * \param rid root id of current instance, default = 0
+         * \param root_index root id of current instance, default = 0
          * \return prediction
-         */
-        virtual float Predict( const booster::FMatrixS::Line &feat, int buffer_index = -1, unsigned rid = 0 ){
+         */
+        inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
             size_t istart = 0;
             float  psum = 0.0f;
@@ -253,7 +255,7 @@ namespace xgboost{
             }
 
             for( size_t i = istart; i < this->boosters.size(); i ++ ){
-                psum += this->boosters[ i ]->Predict( feat, rid );
+                psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
             }
 
             // update the buffered results
@@ -320,7 +322,7 @@ namespace xgboost{
         inline booster::IBooster *GetUpdateBooster( void ){
             if( param.do_reboost == 0 || boosters.size() == 0 ){
                 param.num_boosters += 1;
-                boosters.push_back( booster::CreateBooster( param.booster_type ) );
+                boosters.push_back( booster::CreateBooster<FMatrixS>( param.booster_type ) );
                 booster_info.push_back( 0 );
                 this->ConfigBooster( boosters.back() );
                 boosters.back()->InitModel();
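
The Predict shown above resumes from a per-instance cache when buffer_index >= 0: istart and psum pick up where the previous call left off, so after adding one booster per round only the newest booster needs to be evaluated. A self-contained sketch of that caching idea; pred_buffer, pred_counter, and SumRange are illustrative names, since the real state lives outside the shown hunks:

    #include <cstddef>
    #include <vector>

    struct BufferedPredictor{
        std::vector<float>    pred_buffer;   // cached partial sum per buffer slot
        std::vector<unsigned> pred_counter;  // number of boosters the cache covers

        // stand-in for summing boosters[i]->Predict(...) over i in [istart, nbooster)
        inline float SumRange( size_t istart, size_t nbooster ) const{
            float s = 0.0f;
            for( size_t i = istart; i < nbooster; ++ i ) s += 1.0f;  // dummy outputs
            return s;
        }
        inline float Predict( int buffer_index, size_t nbooster ){
            size_t istart = 0; float psum = 0.0f;
            if( buffer_index >= 0 ){             // resume from the cached prefix
                istart = pred_counter[ buffer_index ];
                psum   = pred_buffer [ buffer_index ];
            }
            psum += SumRange( istart, nbooster );
            if( buffer_index >= 0 ){             // remember the new prefix
                pred_counter[ buffer_index ] = (unsigned)nbooster;
                pred_buffer [ buffer_index ] = psum;
            }
            return psum;
        }
    };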
diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h
index 5096aec5e..7d1a680dc 100644
--- a/regression/xgboost_reg.h
+++ b/regression/xgboost_reg.h
@@ -174,7 +174,7 @@ namespace xgboost{
                 #pragma omp parallel for schedule( static )
                 for( unsigned j = 0; j < ndata; ++ j ){
                     preds[j] = mparam.PredTransform
-                        ( mparam.base_score + base_model.Predict( data.data[j], -1 ) );
+                        ( mparam.base_score + base_model.Predict( data.data, j, -1 ) );
                 }
             }
         private:
@@ -186,7 +186,7 @@ namespace xgboost{
                 #pragma omp parallel for schedule( static )
                 for( unsigned j = 0; j < ndata; ++ j ){
                     preds[j] = mparam.PredTransform
-                        ( mparam.base_score + base_model.Predict( data.data[j], buffer_offset + j ) );
+                        ( mparam.base_score + base_model.Predict( data.data, j, buffer_offset + j ) );
                 }
             }
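
These two call sites differ only in the buffer index they pass: training data gets a persistent slot per instance (buffer_offset + j), while one-off evaluation passes -1 to bypass the cache. The OpenMP loop is race-free because iteration j writes only preds[j] and its own buffer slot. A compressed sketch of that pattern, with PredictOne as a hypothetical stand-in for base_model.Predict:

    #include <omp.h>
    #include <vector>

    typedef unsigned bst_uint;

    // hypothetical stand-in for base_model.Predict( data.data, j, buffer_index )
    inline float PredictOne( bst_uint row, int buffer_index ){
        (void)buffer_index;          // a real model would read/update its buffer here
        return 0.1f * (float)row;    // dummy score
    }

    void PredictAll( std::vector<float> &preds, unsigned ndata, int buffer_offset ){
        preds.resize( ndata );
        // race-free: iteration j writes only preds[j] and buffer slot buffer_offset+j
        #pragma omp parallel for schedule( static )
        for( unsigned j = 0; j < ndata; ++ j ){
            const int bid = buffer_offset < 0 ? -1 : buffer_offset + (int)j;
            preds[j] = PredictOne( j, bid );
        }
    }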