From 9b09cd3d492cd8daf2949514823273132c6c2eaf Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 26 Feb 2014 11:51:58 -0800 Subject: [PATCH] change input data structure --- .gitignore | 5 + booster/linear/xgboost_linear.hpp | 17 +- booster/tree/xgboost_svdf_tree.hpp | 18 +- booster/xgboost.cpp | 10 +- booster/xgboost.h | 6 +- booster/xgboost_data.h | 268 ++++++++++++++++++----------- booster/xgboost_gbmbase.h | 2 +- regression/xgboost_reg.h | 3 +- regression/xgboost_regdata.h | 4 +- 9 files changed, 204 insertions(+), 129 deletions(-) diff --git a/.gitignore b/.gitignore index f800ecd23..c5103fd56 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,8 @@ *.la *.a *~ +*txt* +*conf +*buffer +*model +xgboost \ No newline at end of file diff --git a/booster/linear/xgboost_linear.hpp b/booster/linear/xgboost_linear.hpp index 852bbc7fe..a6e10199f 100644 --- a/booster/linear/xgboost_linear.hpp +++ b/booster/linear/xgboost_linear.hpp @@ -38,7 +38,7 @@ namespace xgboost{ public: virtual void DoBoost( std::vector &grad, std::vector &hess, - const FMatrixS::Image &smat, + const FMatrixS &smat, const std::vector &root_index ){ utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" ); this->Update( smat, grad, hess ); @@ -46,7 +46,7 @@ namespace xgboost{ virtual float Predict( const FMatrixS::Line &sp, unsigned rid = 0 ){ float sum = model.bias(); for( unsigned i = 0; i < sp.len; i ++ ){ - sum += model.weight[ sp.findex[i] ] * sp.fvalue[i]; + sum += model.weight[ sp[i].findex ] * sp[i].fvalue; } return sum; } @@ -208,11 +208,10 @@ namespace xgboost{ } } } - inline void MakeCmajor( std::vector &rptr, std::vector &entry, const std::vector &hess, - const FMatrixS::Image &smat ){ + const FMatrixS &smat ){ // transform to column order first const int nfeat = model.param.num_feature; // build CSR column major format data @@ -224,8 +223,8 @@ namespace xgboost{ // add sparse part budget FMatrixS::Line sp = smat[ i ]; for( unsigned j = 0; j < sp.len; j ++ 
){ - if( j == 0 || sp.findex[j-1] != sp.findex[j] ){ - builder.AddBudget( sp.findex[j] ); + if( j == 0 || sp[j-1].findex != sp[j].findex ){ + builder.AddBudget( sp[j].findex ); } } } @@ -237,14 +236,14 @@ namespace xgboost{ FMatrixS::Line sp = smat[ i ]; for( unsigned j = 0; j < sp.len; j ++ ){ // skip duplicated terms - if( j == 0 || sp.findex[j-1] != sp.findex[j] ){ - builder.PushElem( sp.findex[j], SCEntry( sp.fvalue[j], i ) ); + if( j == 0 || sp[j-1].findex != sp[j].findex ){ + builder.PushElem( sp[j].findex, SCEntry( sp[j].fvalue, i ) ); } } } } protected: - virtual void Update( const FMatrixS::Image &smat, + virtual void Update( const FMatrixS &smat, std::vector &grad, const std::vector &hess ){ std::vector rptr; diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp index f34d271f1..8155618b2 100644 --- a/booster/tree/xgboost_svdf_tree.hpp +++ b/booster/tree/xgboost_svdf_tree.hpp @@ -131,7 +131,7 @@ namespace xgboost{ RTree &tree; std::vector &grad; std::vector &hess; - const FMatrixS::Image &smat; + const FMatrixS &smat; const std::vector &group_id; private: // maximum depth up to now @@ -322,7 +322,7 @@ namespace xgboost{ FMatrixS::Line sp = smat[ ridx ]; for( unsigned j = 0; j < sp.len; j ++ ){ - builder.AddBudget( sp.findex[j] ); + builder.AddBudget( sp[j].findex ); } } @@ -336,7 +336,7 @@ namespace xgboost{ const unsigned ridx = tsk.idset[i]; FMatrixS::Line sp = smat[ ridx ]; for( unsigned j = 0; j < sp.len; j ++ ){ - builder.PushElem( sp.findex[j], SCEntry( sp.fvalue[j], ridx ) ); + builder.PushElem( sp[j].findex, SCEntry( sp[j].fvalue, ridx ) ); } } // --- end of building column major matrix --- @@ -429,7 +429,7 @@ namespace xgboost{ RTree &ptree, std::vector &pgrad, std::vector &phess, - const FMatrixS::Image &psmat, + const FMatrixS &psmat, const std::vector &pgroup_id ): param( pparam ), tree( ptree ), grad( pgrad ), hess( phess ), smat( psmat ), group_id( pgroup_id ){ @@ -494,7 +494,7 @@ namespace xgboost{ public: 
virtual void DoBoost( std::vector &grad, std::vector &hess, - const FMatrixS::Image &smat, + const FMatrixS &smat, const std::vector &group_id ){ utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" ); if( !silent ){ @@ -526,14 +526,14 @@ namespace xgboost{ virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){ this->init_tmpfeat(); for( unsigned i = 0; i < feat.len; i ++ ){ - utils::Assert( feat.findex[i] < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); - tmp_funknown[ feat.findex[i] ] = false; - tmp_feat[ feat.findex[i] ] = feat.fvalue[i]; + utils::Assert( feat[i].findex < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); + tmp_funknown[ feat[i].findex ] = false; + tmp_feat[ feat[i].findex ] = feat[i].fvalue; } int pid = this->GetLeafIndex( tmp_feat, tmp_funknown, gid ); // set back for( unsigned i = 0; i < feat.len; i ++ ){ - tmp_funknown[ feat.findex[i] ] = true; + tmp_funknown[ feat[i].findex ] = true; } return tree[ pid ].leaf_value(); } diff --git a/booster/xgboost.cpp b/booster/xgboost.cpp index f6356049d..7507c11c6 100644 --- a/booster/xgboost.cpp +++ b/booster/xgboost.cpp @@ -16,9 +16,9 @@ namespace xgboost{ namespace booster{ - /*ŁĄ - * \brief listing the types of boosters - */ + /* + * \brief listing the types of boosters + */ enum BOOSTER_TYPE_LIST{ TREE, LINEAR, @@ -30,8 +30,8 @@ namespace xgboost{ */ IBooster *CreateBooster( int booster_type ){ switch( booster_type ){ - case TREE: return new RTreeTrainer(); - case LINEAR: return new LinearBooster(); + case TREE: return new RTreeTrainer(); + case LINEAR: return new LinearBooster(); default: utils::Error("unknown booster_type"); return NULL; } } diff --git a/booster/xgboost.h b/booster/xgboost.h index fe929cedf..06fb644f8 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -16,7 +16,9 @@ namespace xgboost{ /*! \brief namespace for boosters */ namespace booster{ - /*! 
\brief interface of a gradient boosting learner */ + /*! + * \brief interface of a gradient boosting learner + */ class IBooster{ public: // interface for model setting and loading @@ -61,7 +63,7 @@ namespace xgboost{ */ virtual void DoBoost( std::vector &grad, std::vector &hess, - const FMatrixS::Image &feats, + const FMatrixS &feats, const std::vector &root_index ) = 0; /*! * \brief predict values for given sparse feature vector diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index 285b770b1..84935e4a7 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -24,119 +24,187 @@ namespace xgboost{ const bool bst_debug = false; }; }; + namespace xgboost{ namespace booster{ - /*! - * \brief feature matrix to store training instance, in sparse CSR format + /** + * \brief This is an interface, defining the way to access features, + * by column or by row. This interface is used to make implementation + * of booster does not depend on how feature is stored. + * + * Why template instead of virtual class: for efficiency + * feature matrix is going to be used by most inner loop of the algorithm + * + * \tparam Derived type of actual implementation + * \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing */ - class FMatrixS{ + template + struct FMatrix{ public: - /*! \brief one row of sparse feature matrix */ - struct Line{ - /*! \brief array of feature index */ - const bst_uint *findex; - /*! \brief array of feature value */ - const bst_float *fvalue; - /*! \brief size of the data */ - bst_uint len; + /*! \brief example iterator over one row */ + struct RowIter{ + /*! + * \brief move to next position + * \return whether there is element in next position + */ + inline bool Next( void ); + /*! \return feature index in current position */ + inline bst_uint findex( void ) const; + /*! \return feature value in current position */ + inline bst_float fvalue( void ) const; }; + /*! 
\brief example iterator over one column */ + struct ColIter{ + /*! + * \brief move to next position + * \return whether there is element in next position + */ + inline bool Next( void ); + /*! \return row index of current position */ + inline bst_uint rindex( void ) const; + /*! \return feature value in current position */ + inline bst_float fvalue( void ) const; + }; + public: /*! - * \brief remapped image of sparse matrix, - * allows use a subset of sparse matrix, by specifying a rowmap + * \brief prepare sorted columns so that GetSortedCol can be called */ - struct Image{ - public: - Image( const FMatrixS &smat ):smat(smat), row_map( tmp_rowmap ){ - } - Image( const FMatrixS &smat, const std::vector &row_map ) - :smat(smat), row_map(row_map){ - } - /*! \brief get sparse part of current row */ - inline Line operator[]( size_t sidx ) const{ - if( row_map.size() == 0 ) return smat[ sidx ]; - else return smat[ row_map[ sidx ] ]; - } - private: - // used to set the simple case - std::vector tmp_rowmap; - const FMatrixS &smat; - const std::vector &row_map; - }; - public: - // -----Note: unless needed for hacking, these fields should not be accessed directly ----- - /*! \brief row pointer of CSR sparse storage */ - std::vector row_ptr; - /*! \brief index of CSR format */ - std::vector findex; - /*! \brief value of CSR format */ - std::vector fvalue; - public: - /*! \brief constructor */ - FMatrixS( void ){ this->Clear(); } + inline void MakeSortedCol( void ); /*! * \brief get number of rows * \return number of rows */ + inline size_t NumRow( void ) const; + /*! + * \brief get number of columns + * \return number of columns + */ + inline size_t NumCol( void ) const; + /*! + * \brief get row iterator + * \param ridx row index + * \return row iterator + */ + inline RowIter GetRow( size_t ridx ) const; + /*! 
+ * \brief get column iterator, the columns must be sorted by feature value + * \param ridx column index + * \return column iterator + */ + inline ColIter GetSortedCol( size_t ridx ) const; + + /*! \return the view of derived class */ + inline const Derived& self( void ) const{ + return *static_cast(this); + } + }; + }; +}; + +namespace xgboost{ + namespace booster{ + /*! + * \brief feature matrix to store training instance, in sparse CSR format + */ + class FMatrixS: public FMatrix{ + public: + /*! \brief one entry in a row */ + struct REntry{ + /*! \brief feature index */ + bst_uint findex; + /*! \brief feature value */ + bst_float fvalue; + }; + /*! \brief one entry in a row */ + struct CEntry{ + /*! \brief row index */ + bst_uint rindex; + /*! \brief feature value */ + bst_float fvalue; + }; + /*! \brief one row of sparse feature matrix */ + struct Line{ + /*! \brief array of feature index */ + const REntry *data_; + /*! \brief size of the data */ + bst_uint len; + inline const REntry& operator[]( unsigned i ) const{ + return data_[i]; + } + }; + public: + struct RowIter{ + const REntry *dptr, *end; + inline bool Next( void ){ + if( dptr == end ) return false; + else{ + ++ dptr; return true; + } + } + inline bst_uint findex( void ) const{ + return dptr->findex; + } + inline bst_float fvalue( void ) const{ + return dptr->fvalue; + } + }; + public: + /*! \brief constructor */ + FMatrixS( void ){ this->Clear(); } + /*! \brief get number of rows */ inline size_t NumRow( void ) const{ - return row_ptr.size() - 1; + return row_ptr_.size() - 1; } /*! * \brief get number of nonzero entries * \return number of nonzero entries */ inline size_t NumEntry( void ) const{ - return findex.size(); + return row_data_.size(); } /*! \brief clear the storage */ inline void Clear( void ){ - row_ptr.resize( 0 ); - findex.resize( 0 ); - fvalue.resize( 0 ); - row_ptr.push_back( 0 ); - } - /*! 
- * \brief add a row to the matrix, but only accept features from fstart to fend - * \param feat sparse feature - * \param fstart start bound of feature - * \param fend end bound range of feature - * \return the row id of added line - */ - inline size_t AddRow( const Line &feat, unsigned fstart = 0, unsigned fend = UINT_MAX ){ - utils::Assert( feat.len >= 0, "sparse feature length can not be negative" ); - unsigned cnt = 0; - for( unsigned i = 0; i < feat.len; i ++ ){ - if( feat.findex[i] < fstart || feat.findex[i] >= fend ) continue; - findex.push_back( feat.findex[i] ); - fvalue.push_back( feat.fvalue[i] ); - cnt ++; - } - row_ptr.push_back( row_ptr.back() + cnt ); - return row_ptr.size() - 2; - } - - /*! - * \brief add a row to the matrix, with data stored in STL container - * \param findex feature index - * \param fvalue feature value - * \return the row id added line - */ - inline size_t AddRow( const std::vector &findex, - const std::vector &fvalue ){ - FMatrixS::Line l; - utils::Assert( findex.size() == fvalue.size() ); - l.findex = &findex[0]; - l.fvalue = &fvalue[0]; - l.len = static_cast( findex.size() ); - return this->AddRow( l ); + row_ptr_.clear(); + row_ptr_.push_back( 0 ); + row_data_.clear(); } /*! \brief get sparse part of current row */ inline Line operator[]( size_t sidx ) const{ Line sp; utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" ); - sp.len = static_cast( row_ptr[ sidx + 1 ] - row_ptr[ sidx ] ); - sp.findex = &findex[ row_ptr[ sidx ] ]; - sp.fvalue = &fvalue[ row_ptr[ sidx ] ]; + sp.len = static_cast( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] ); + sp.data_ = &row_data_[ row_ptr_[ sidx ] ]; return sp; + } + /*! \brief get row iterator*/ + inline RowIter GetRow( size_t ridx ) const{ + utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" ); + RowIter it; + it.dptr = &row_data_[ row_ptr_[ridx] ] - 1; + it.end = &row_data_[ row_ptr_[ridx+1] ] - 1; + return it; + } + /*! 
+ * \brief add a row to the matrix, with data stored in STL container + * \param findex feature index + * \param fvalue feature value + * \param fstart start bound of feature + * \param fend end bound range of feature + * \return the row id added line + */ + inline size_t AddRow( const std::vector &findex, + const std::vector &fvalue, + unsigned fstart = 0, unsigned fend = UINT_MAX ){ + utils::Assert( findex.size() == fvalue.size() ); + unsigned cnt = 0; + for( size_t i = 0; i < findex.size(); i ++ ){ + if( findex[i] < fstart || findex[i] >= fend ) continue; + REntry e; e.findex = findex[i]; e.fvalue = fvalue[i]; + row_data_.push_back( e ); + cnt ++; + } + row_ptr_.push_back( row_ptr_.back() + cnt ); + return row_ptr_.size() - 2; } public: /*! @@ -148,10 +216,9 @@ namespace xgboost{ inline void SaveBinary(utils::IStream &fo ) const{ size_t nrow = this->NumRow(); fo.Write( &nrow, sizeof(size_t) ); - fo.Write( &row_ptr[0], row_ptr.size() * sizeof(size_t) ); - if( findex.size() != 0 ){ - fo.Write( &findex[0] , findex.size() * sizeof(bst_uint) ); - fo.Write( &fvalue[0] , fvalue.size() * sizeof(bst_float) ); + fo.Write( &row_ptr_[0], row_ptr_.size() * sizeof(size_t) ); + if( row_data_.size() != 0 ){ + fo.Write( &row_data_[0] , row_data_.size() * sizeof(REntry) ); } } /*! 
@@ -163,17 +230,20 @@ namespace xgboost{ inline void LoadBinary( utils::IStream &fi ){ size_t nrow; utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" ); - row_ptr.resize( nrow + 1 ); - utils::Assert( fi.Read( &row_ptr[0], row_ptr.size() * sizeof(size_t) ), "Load FMatrixS" ); + row_ptr_.resize( nrow + 1 ); + utils::Assert( fi.Read( &row_ptr_[0], row_ptr_.size() * sizeof(size_t) ), "Load FMatrixS" ); - findex.resize( row_ptr.back() ); fvalue.resize( row_ptr.back() ); - if( findex.size() != 0 ){ - utils::Assert( fi.Read( &findex[0] , findex.size() * sizeof(bst_uint) ) , "Load FMatrixS" ); - utils::Assert( fi.Read( &fvalue[0] , fvalue.size() * sizeof(bst_float) ), "Load FMatrixS" ); + row_data_.resize( row_ptr_.back() ); + if( row_data_.size() != 0 ){ + utils::Assert( fi.Read( &row_data_[0] , row_data_.size() * sizeof(REntry) ) , "Load FMatrixS" ); } } - }; - }; + private: + /*! \brief row pointer of CSR sparse storage */ + std::vector row_ptr_; + /*! \brief data in the row */ + std::vector row_data_; + }; + }; }; - #endif diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h index 1b3ddd959..7759de35a 100644 --- a/booster/xgboost_gbmbase.h +++ b/booster/xgboost_gbmbase.h @@ -191,7 +191,7 @@ namespace xgboost{ */ inline void DoBoost( std::vector &grad, std::vector &hess, - const booster::FMatrixS::Image &feats, + const booster::FMatrixS &feats, const std::vector &root_index ) { booster::IBooster *bst = this->GetUpdateBooster(); bst->DoBoost( grad, hess, feats, root_index ); diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h index 623f70e6d..a55226798 100644 --- a/regression/xgboost_reg.h +++ b/regression/xgboost_reg.h @@ -117,8 +117,7 @@ namespace xgboost{ this->GetGradient( preds, train_->labels, grad, hess ); std::vector root_index; - booster::FMatrixS::Image train_image( train_->data ); - base_model.DoBoost(grad,hess,train_image,root_index); + base_model.DoBoost(grad,hess,train_->data,root_index); } /*! 
* \brief evaluate the model for specific iteration diff --git a/regression/xgboost_regdata.h b/regression/xgboost_regdata.h index d73565614..0a86c51f5 100644 --- a/regression/xgboost_regdata.h +++ b/regression/xgboost_regdata.h @@ -132,8 +132,8 @@ namespace xgboost{ for( size_t i = 0; i < data.NumRow(); i ++ ){ booster::FMatrixS::Line sp = data[i]; for( unsigned j = 0; j < sp.len; j ++ ){ - if( num_feature <= sp.findex[j] ){ - num_feature = sp.findex[j] + 1; + if( num_feature <= sp[j].findex ){ + num_feature = sp[j].findex + 1; } } }