diff --git a/Makefile b/Makefile index ef37a002d..75baf5662 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,8 @@ export LDFLAGS= -pthread -lm xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp +#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp + $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) diff --git a/booster/xgboost-inl.hpp b/booster/xgboost-inl.hpp index db44b5ac7..95ba90d15 100644 --- a/booster/xgboost-inl.hpp +++ b/booster/xgboost-inl.hpp @@ -2,7 +2,7 @@ #define XGBOOST_INL_HPP /*! * \file xgboost-inl.hpp - * \brief bootser implementations + * \brief bootser implementations * \author Tianqi Chen: tianqi.tchen@gmail.com */ // implementation of boosters go to here @@ -18,7 +18,7 @@ #include "linear/xgboost_linear.hpp" namespace xgboost{ - namespace booster{ + namespace booster{ /*! * \brief create a gradient booster, given type of booster * \param booster_type type of gradient booster, can be used to specify implements @@ -26,14 +26,14 @@ namespace xgboost{ * \return the pointer to the gradient booster created */ template - inline InterfaceBooster *CreateBooster( int booster_type ){ - switch( booster_type ){ + inline InterfaceBooster *CreateBooster(int booster_type){ + switch (booster_type){ case 0: return new RegTreeTrainer(); case 1: return new LinearBooster(); default: utils::Error("unknown booster_type"); return NULL; } - } - }; // namespace booster + } + }; // namespace booster }; // namespace xgboost #endif // XGBOOST_INL_HPP diff --git a/booster/xgboost.h b/booster/xgboost.h index 608eb5d7a..11d79b410 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -19,8 +19,8 @@ namespace xgboost{ /*! \brief namespace for boosters */ namespace booster{ - /*! - * \brief interface of a gradient boosting learner + /*! 
+ * \brief interface of a gradient boosting learner * \tparam FMatrix the feature matrix format that the booster takes */ template @@ -35,101 +35,101 @@ namespace xgboost{ // call booster->LoadModel // (3) booster->DoBoost to update the model // (4) booster->Predict to get new prediction - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - virtual void SetParam( const char *name, const char *val ) = 0; - /*! + virtual void SetParam(const char *name, const char *val) = 0; + /*! * \brief load model from stream * \param fi input stream */ - virtual void LoadModel( utils::IStream &fi ) = 0; - /*! + virtual void LoadModel(utils::IStream &fi) = 0; + /*! * \brief save model to stream * \param fo output stream */ - virtual void SaveModel( utils::IStream &fo ) const = 0; + virtual void SaveModel(utils::IStream &fo) const = 0; /*! * \brief initialize solver before training, called before training - * this function is reserved for solver to allocate necessary space and do other preparation + * this function is reserved for solver to allocate necessary space and do other preparation */ - virtual void InitModel( void ) = 0; + virtual void InitModel(void) = 0; public: - /*! - * \brief do gradient boost training for one step, using the information given, + /*! 
+ * \brief do gradient boost training for one step, using the information given, * Note: content of grad and hess can change after DoBoost * \param grad first order gradient of each instance * \param hess second order gradient of each instance * \param feats features of each instance - * \param root_index pre-partitioned root index of each instance, + * \param root_index pre-partitioned root index of each instance, * root_index.size() can be 0 which indicates that no pre-partition involved */ - virtual void DoBoost( std::vector &grad, - std::vector &hess, - const FMatrix &feats, - const std::vector &root_index ) = 0; - /*! + virtual void DoBoost(std::vector &grad, + std::vector &hess, + const FMatrix &feats, + const std::vector &root_index) = 0; + /*! * \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree * \param path the result of path * \param feats feature matrix * \param row_index row index in the feature matrix * \param root_index root id of current instance, default = 0 */ - virtual void PredPath( std::vector &path, const FMatrix &feats, - bst_uint row_index, unsigned root_index = 0 ){ - utils::Error( "not implemented" ); + virtual void PredPath(std::vector &path, const FMatrix &feats, + bst_uint row_index, unsigned root_index = 0){ + utils::Error("not implemented"); } - /*! + /*! 
* \brief predict values for given sparse feature vector - * + * * NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general, * dense version of Predict to ensures threadsafety * \param feats feature matrix * \param row_index row index in the feature matrix * \param root_index root id of current instance, default = 0 - * \return prediction - */ - virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){ - utils::Error( "not implemented" ); + * \return prediction + */ + virtual float Predict(const FMatrix &feats, bst_uint row_index, unsigned root_index = 0){ + utils::Error("not implemented"); return 0.0f; } - /*! + /*! * \brief predict values for given dense feature vector * \param feat feature vector in dense format * \param funknown indicator that the feature is missing * \param rid root id of current instance, default = 0 * \return prediction */ - virtual float Predict( const std::vector &feat, - const std::vector &funknown, - unsigned rid = 0 ){ - utils::Error( "not implemented" ); + virtual float Predict(const std::vector &feat, + const std::vector &funknown, + unsigned rid = 0){ + utils::Error("not implemented"); return 0.0f; } - /*! + /*! * \brief print information - * \param fo output stream - */ - virtual void PrintInfo( FILE *fo ){} - /*! + * \param fo output stream + */ + virtual void PrintInfo(FILE *fo){} + /*! * \brief dump model into text file - * \param fo output stream + * \param fo output stream * \param fmap feature map that may help give interpretations of feature * \param with_stats whether print statistics */ - virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){ - utils::Error( "not implemented" ); + virtual void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats = false){ + utils::Error("not implemented"); } public: /*! 
\brief virtual destructor */ - virtual ~InterfaceBooster( void ){} + virtual ~InterfaceBooster(void){} }; }; namespace booster{ - /*! - * \brief this will is the most commonly used booster interface + /*! + * \brief this will is the most commonly used booster interface * we try to make booster invariant of data structures, but most cases, FMatrixS is what we wnat */ typedef InterfaceBooster IBooster; @@ -138,7 +138,7 @@ namespace xgboost{ namespace xgboost{ namespace booster{ - /*! + /*! * \brief create a gradient booster, given type of booster * normally we use FMatrixS, by calling CreateBooster * \param booster_type type of gradient booster, can be used to specify implements @@ -146,7 +146,7 @@ namespace xgboost{ * \return the pointer to the gradient booster created */ template - inline InterfaceBooster *CreateBooster( int booster_type ); + inline InterfaceBooster *CreateBooster(int booster_type); }; }; diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index ebe2faa4d..bdc04478a 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -21,76 +21,76 @@ namespace xgboost{ typedef unsigned bst_uint; /*! \brief float type used in boost */ typedef float bst_float; - /*! \brief debug option for booster */ - const bool bst_debug = false; + /*! \brief debug option for booster */ + const bool bst_debug = false; }; }; namespace xgboost{ namespace booster{ /** - * \brief This is a interface, defining the way to access features, + * \brief This is a interface, defining the way to access features, * by column or by row. This interface is used to make implementation * of booster does not depend on how feature is stored. 
* * Why template instead of virtual class: for efficiency * feature matrix is going to be used by most inner loop of the algorithm * - * \tparam Derived type of actual implementation + * \tparam Derived type of actual implementation * \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing */ template struct FMatrix{ public: - /*! \brief exmaple iterator over one row */ + /*! \brief exmaple iterator over one row */ struct RowIter{ - /*! - * \brief move to next position + /*! + * \brief move to next position * \return whether there is element in next position */ - inline bool Next( void ); + inline bool Next(void); /*! \return feature index in current position */ - inline bst_uint findex( void ) const; + inline bst_uint findex(void) const; /*! \return feature value in current position */ - inline bst_float fvalue( void ) const; + inline bst_float fvalue(void) const; }; /*! \brief example iterator over one column */ struct ColIter{ - /*! - * \brief move to next position + /*! + * \brief move to next position * \return whether there is element in next position */ - inline bool Next( void ); + inline bool Next(void); /*! \return row index of current position */ - inline bst_uint rindex( void ) const; + inline bst_uint rindex(void) const; /*! \return feature value in current position */ - inline bst_float fvalue( void ) const; + inline bst_float fvalue(void) const; }; /*! \brief backward iterator over column */ struct ColBackIter : public ColIter {}; public: - /*! - * \brief get number of rows + /*! + * \brief get number of rows * \return number of rows */ - inline size_t NumRow( void ) const; - /*! + inline size_t NumRow(void) const; + /*! * \brief get number of columns * \return number of columns */ - inline size_t NumCol( void ) const; + inline size_t NumCol(void) const; /*! * \brief get row iterator * \param ridx row index * \return row iterator */ - inline RowIter GetRow( size_t ridx ) const; - /*! 
+ inline RowIter GetRow(size_t ridx) const; + /*! * \brief get number of column groups, this ise used together with GetRow( ridx, gid ) * \return number of column group */ - inline unsigned NumColGroup( void ) const{ + inline unsigned NumColGroup(void) const{ return 1; } /*! @@ -99,32 +99,32 @@ namespace xgboost{ * \param gid colmun group id * \return row iterator, only iterates over features of specified column group */ - inline RowIter GetRow( size_t ridx, unsigned gid ) const; + inline RowIter GetRow(size_t ridx, unsigned gid) const; /*! \return whether column access is enabled */ - inline bool HaveColAccess( void ) const; + inline bool HaveColAccess(void) const; /*! * \brief get column iterator, the columns must be sorted by feature value * \param ridx column index * \return column iterator */ - inline ColIter GetSortedCol( size_t ridx ) const; + inline ColIter GetSortedCol(size_t ridx) const; /*! * \brief get column backward iterator, starts from biggest fvalue, and iterator back * \param ridx column index * \return reverse column iterator */ - inline ColBackIter GetReverseSortedCol( size_t ridx ) const; + inline ColBackIter GetReverseSortedCol(size_t ridx) const; }; }; }; namespace xgboost{ namespace booster{ - /*! + /*! * \brief feature matrix to store training instance, in sparse CSR format - */ - class FMatrixS: public FMatrix{ + */ + class FMatrixS : public FMatrix{ public: /*! \brief one entry in a row */ struct REntry{ @@ -133,10 +133,10 @@ namespace xgboost{ /*! \brief feature value */ bst_float fvalue; /*! \brief constructor */ - REntry( void ){} + REntry(void){} /*! 
\brief constructor */ - REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){} - inline static bool cmp_fvalue( const REntry &a, const REntry &b ){ + REntry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue){} + inline static bool cmp_fvalue(const REntry &a, const REntry &b){ return a.fvalue < b.fvalue; } }; @@ -147,79 +147,79 @@ namespace xgboost{ /*! \brief size of the data */ bst_uint len; /*! \brief get k-th element */ - inline const REntry& operator[]( unsigned i ) const{ + inline const REntry& operator[](unsigned i) const{ return data_[i]; - } + } }; /*! \brief row iterator */ struct RowIter{ const REntry *dptr_, *end_; - RowIter( const REntry* dptr, const REntry* end ) - :dptr_(dptr),end_(end){} - inline bool Next( void ){ - if( dptr_ == end_ ) return false; + RowIter(const REntry* dptr, const REntry* end) + :dptr_(dptr), end_(end){} + inline bool Next(void){ + if (dptr_ == end_) return false; else{ - ++ dptr_; return true; + ++dptr_; return true; } } - inline bst_uint findex( void ) const{ + inline bst_uint findex(void) const{ return dptr_->findex; } - inline bst_float fvalue( void ) const{ + inline bst_float fvalue(void) const{ return dptr_->fvalue; } }; /*! \brief column iterator */ - struct ColIter: public RowIter{ - ColIter( const REntry* dptr, const REntry* end ) - :RowIter( dptr, end ){} - inline bst_uint rindex( void ) const{ + struct ColIter : public RowIter{ + ColIter(const REntry* dptr, const REntry* end) + :RowIter(dptr, end){} + inline bst_uint rindex(void) const{ return this->findex(); } }; /*! 
\brief reverse column iterator */ - struct ColBackIter: public ColIter{ - ColBackIter( const REntry* dptr, const REntry* end ) - :ColIter( dptr, end ){} + struct ColBackIter : public ColIter{ + ColBackIter(const REntry* dptr, const REntry* end) + :ColIter(dptr, end){} // shadows RowIter::Next - inline bool Next( void ){ - if( dptr_ == end_ ) return false; + inline bool Next(void){ + if (dptr_ == end_) return false; else{ - -- dptr_; return true; + --dptr_; return true; } } }; public: /*! \brief constructor */ - FMatrixS( void ){ this->Clear(); } + FMatrixS(void){ this->Clear(); } /*! \brief get number of rows */ - inline size_t NumRow( void ) const{ + inline size_t NumRow(void) const{ return row_ptr_.size() - 1; } - /*! + /*! * \brief get number of nonzero entries * \return number of nonzero entries */ - inline size_t NumEntry( void ) const{ + inline size_t NumEntry(void) const{ return row_data_.size(); } /*! \brief clear the storage */ - inline void Clear( void ){ + inline void Clear(void){ row_ptr_.clear(); - row_ptr_.push_back( 0 ); + row_ptr_.push_back(0); row_data_.clear(); col_ptr_.clear(); col_data_.clear(); } /*! \brief get sparse part of current row */ - inline Line operator[]( size_t sidx ) const{ + inline Line operator[](size_t sidx) const{ Line sp; - utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" ); - sp.len = static_cast( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] ); - sp.data_ = &row_data_[ row_ptr_[ sidx ] ]; + utils::Assert(!bst_debug || sidx < this->NumRow(), "row id exceed bound"); + sp.len = static_cast(row_ptr_[sidx + 1] - row_ptr_[sidx]); + sp.data_ = &row_data_[row_ptr_[sidx]]; return sp; } - /*! + /*! 
* \brief add a row to the matrix, with data stored in STL container * \param findex feature index * \param fvalue feature value @@ -227,155 +227,155 @@ namespace xgboost{ * \param fend end bound range of feature * \return the row id added line */ - inline size_t AddRow( const std::vector &findex, - const std::vector &fvalue, - unsigned fstart = 0, unsigned fend = UINT_MAX ){ - utils::Assert( findex.size() == fvalue.size() ); + inline size_t AddRow(const std::vector &findex, + const std::vector &fvalue, + unsigned fstart = 0, unsigned fend = UINT_MAX){ + utils::Assert(findex.size() == fvalue.size()); unsigned cnt = 0; - for( size_t i = 0; i < findex.size(); i ++ ){ - if( findex[i] < fstart || findex[i] >= fend ) continue; - row_data_.push_back( REntry( findex[i], fvalue[i] ) ); - cnt ++; + for (size_t i = 0; i < findex.size(); i++){ + if (findex[i] < fstart || findex[i] >= fend) continue; + row_data_.push_back(REntry(findex[i], fvalue[i])); + cnt++; } - row_ptr_.push_back( row_ptr_.back() + cnt ); + row_ptr_.push_back(row_ptr_.back() + cnt); return row_ptr_.size() - 2; } /*! \brief get row iterator*/ - inline RowIter GetRow( size_t ridx ) const{ - utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" ); - return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 ); + inline RowIter GetRow(size_t ridx) const{ + utils::Assert(!bst_debug || ridx < this->NumRow(), "row id exceed bound"); + return RowIter(&row_data_[row_ptr_[ridx]] - 1, &row_data_[row_ptr_[ridx + 1]] - 1); } /*! \brief get row iterator*/ - inline RowIter GetRow( size_t ridx, unsigned gid ) const{ - utils::Assert( gid == 0, "FMatrixS only have 1 column group" ); - return FMatrixS::GetRow( ridx ); + inline RowIter GetRow(size_t ridx, unsigned gid) const{ + utils::Assert(gid == 0, "FMatrixS only have 1 column group"); + return FMatrixS::GetRow(ridx); } public: /*! 
\return whether column access is enabled */ - inline bool HaveColAccess( void ) const{ + inline bool HaveColAccess(void) const{ return col_ptr_.size() != 0 && col_data_.size() == row_data_.size(); } /*! \brief get number of colmuns */ - inline size_t NumCol( void ) const{ - utils::Assert( this->HaveColAccess() ); + inline size_t NumCol(void) const{ + utils::Assert(this->HaveColAccess()); return col_ptr_.size() - 1; } /*! \brief get col iterator*/ - inline ColIter GetSortedCol( size_t cidx ) const{ - utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); - return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 ); + inline ColIter GetSortedCol(size_t cidx) const{ + utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound"); + return ColIter(&col_data_[col_ptr_[cidx]] - 1, &col_data_[col_ptr_[cidx + 1]] - 1); } /*! \brief get col iterator */ - inline ColBackIter GetReverseSortedCol( size_t cidx ) const{ - utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); - return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] ); + inline ColBackIter GetReverseSortedCol(size_t cidx) const{ + utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound"); + return ColBackIter(&col_data_[col_ptr_[cidx + 1]], &col_data_[col_ptr_[cidx]]); } /*! 
* \brief intialize the data so that we have both column and row major * access, call this whenever we need column access */ - inline void InitData( void ){ - utils::SparseCSRMBuilder builder( col_ptr_, col_data_ ); - builder.InitBudget( 0 ); - for( size_t i = 0; i < this->NumRow(); i ++ ){ - for( RowIter it = this->GetRow(i); it.Next(); ){ - builder.AddBudget( it.findex() ); + inline void InitData(void){ + utils::SparseCSRMBuilder builder(col_ptr_, col_data_); + builder.InitBudget(0); + for (size_t i = 0; i < this->NumRow(); i++){ + for (RowIter it = this->GetRow(i); it.Next();){ + builder.AddBudget(it.findex()); } } builder.InitStorage(); - for( size_t i = 0; i < this->NumRow(); i ++ ){ - for( RowIter it = this->GetRow(i); it.Next(); ){ - builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) ); + for (size_t i = 0; i < this->NumRow(); i++){ + for (RowIter it = this->GetRow(i); it.Next();){ + builder.PushElem(it.findex(), REntry((bst_uint)i, it.fvalue())); } } // sort columns - unsigned ncol = static_cast( this->NumCol() ); - for( unsigned i = 0; i < ncol; i ++ ){ - std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue ); + unsigned ncol = static_cast(this->NumCol()); + for (unsigned i = 0; i < ncol; i++){ + std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue); } } /*! - * \brief save data to binary stream - * note: since we have size_t in ptr, + * \brief save data to binary stream + * note: since we have size_t in ptr, * the function is not consistent between 64bit and 32bit machine * \param fo output stream */ - inline void SaveBinary( utils::IStream &fo ) const{ - FMatrixS::SaveBinary( fo, row_ptr_, row_data_ ); + inline void SaveBinary(utils::IStream &fo) const{ + FMatrixS::SaveBinary(fo, row_ptr_, row_data_); int col_access = this->HaveColAccess() ? 
1 : 0; - fo.Write( &col_access, sizeof(int) ); - if( col_access != 0 ){ - FMatrixS::SaveBinary( fo, col_ptr_, col_data_ ); + fo.Write(&col_access, sizeof(int)); + if (col_access != 0){ + FMatrixS::SaveBinary(fo, col_ptr_, col_data_); } } /*! - * \brief load data from binary stream - * note: since we have size_t in ptr, + * \brief load data from binary stream + * note: since we have size_t in ptr, * the function is not consistent between 64bit and 32bit machin * \param fi input stream */ - inline void LoadBinary( utils::IStream &fi ){ - FMatrixS::LoadBinary( fi, row_ptr_, row_data_ ); - int col_access; - fi.Read( &col_access, sizeof(int) ); - if( col_access != 0 ){ - FMatrixS::LoadBinary( fi, col_ptr_, col_data_ ); + inline void LoadBinary(utils::IStream &fi){ + FMatrixS::LoadBinary(fi, row_ptr_, row_data_); + int col_access; + fi.Read(&col_access, sizeof(int)); + if (col_access != 0){ + FMatrixS::LoadBinary(fi, col_ptr_, col_data_); } } /*! - * \brief load from text file + * \brief load from text file * \param fi input file pointer - */ - inline void LoadText( FILE *fi ){ + */ + inline void LoadText(FILE *fi){ this->Clear(); int ninst; - while( fscanf( fi, "%d", &ninst ) == 1 ){ + while (fscanf(fi, "%d", &ninst) == 1){ std::vector findex; std::vector fvalue; - while( ninst -- ){ + while (ninst--){ unsigned index; float value; - utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" ); - findex.push_back( index ); fvalue.push_back( value ); + utils::Assert(fscanf(fi, "%u:%f", &index, &value) == 2, "load Text"); + findex.push_back(index); fvalue.push_back(value); } - this->AddRow( findex, fvalue ); + this->AddRow(findex, fvalue); } // initialize column support as well this->InitData(); } private: /*! 
- * \brief save data to binary stream + * \brief save data to binary stream * \param fo output stream * \param ptr pointer data * \param data data content */ - inline static void SaveBinary( utils::IStream &fo, - const std::vector &ptr, - const std::vector &data ){ + inline static void SaveBinary(utils::IStream &fo, + const std::vector &ptr, + const std::vector &data){ size_t nrow = ptr.size() - 1; - fo.Write( &nrow, sizeof(size_t) ); - fo.Write( &ptr[0], ptr.size() * sizeof(size_t) ); - if( data.size() != 0 ){ - fo.Write( &data[0] , data.size() * sizeof(REntry) ); + fo.Write(&nrow, sizeof(size_t)); + fo.Write(&ptr[0], ptr.size() * sizeof(size_t)); + if (data.size() != 0){ + fo.Write(&data[0], data.size() * sizeof(REntry)); } } /*! - * \brief load data from binary stream + * \brief load data from binary stream * \param fi input stream * \param ptr pointer data * \param data data content */ - inline static void LoadBinary( utils::IStream &fi, - std::vector &ptr, - std::vector &data ){ + inline static void LoadBinary(utils::IStream &fi, + std::vector &ptr, + std::vector &data){ size_t nrow; - utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" ); - ptr.resize( nrow + 1 ); - utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" ); + utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS"); + ptr.resize(nrow + 1); + utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)), "Load FMatrixS"); - data.resize( ptr.back() ); - if( data.size() != 0 ){ - utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" ); + data.resize(ptr.back()); + if (data.size() != 0){ + utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)), "Load FMatrixS"); } } protected: @@ -387,7 +387,7 @@ namespace xgboost{ std::vector col_ptr_; /*! 
\brief column datas */ std::vector col_data_; - }; + }; }; }; #endif diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h index fbd06c417..1a7764430 100644 --- a/booster/xgboost_gbmbase.h +++ b/booster/xgboost_gbmbase.h @@ -8,25 +8,25 @@ #include "../utils/xgboost_config.h" /*! * \file xgboost_gbmbase.h - * \brief a base model class, + * \brief a base model class, * that assembles the ensembles of booster together and do model update - * this class can be used as base code to create booster variants + * this class can be used as base code to create booster variants * * The detailed implementation of boosters should start by using the class * provided by this file - * + * * \author Tianqi Chen: tianqi.tchen@gmail.com */ namespace xgboost{ namespace booster{ /*! - * \brief a base model class, + * \brief a base model class, * that assembles the ensembles of booster together and provide single routines to do prediction buffer and update - * this class can be used as base code to create booster variants + * this class can be used as base code to create booster variants * * * relation to xgboost.h: * (1) xgboost.h provides a interface to a single booster(e.g. a single regression tree ) - * while GBMBaseModel builds upon IBooster to build a class that + * while GBMBaseModel builds upon IBooster to build a class that * ensembls the boosters together; * (2) GBMBaseModel provides prediction buffering scheme to speedup training; * (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles; @@ -37,259 +37,260 @@ namespace xgboost{ * (3) model.InitTrainer before calling model.Predict and model.DoBoost * (4) model.Predict to get predictions given a instance * (4) model.DoBoost to update the ensembles, add new booster to the model - * (4) model.SaveModel to save learned results + * (4) model.SaveModel to save learned results * - * Bufferring: each instance comes with a buffer_index in Predict. 
- * when mparam.num_pbuffer != 0, a unique buffer index can be + * Bufferring: each instance comes with a buffer_index in Predict. + * when mparam.num_pbuffer != 0, a unique buffer index can be * assigned to each instance to buffer previous results of boosters, - * this helps to speedup training, so consider assign buffer_index + * this helps to speedup training, so consider assign buffer_index * for each training instances, if buffer_index = -1, the code * recalculate things from scratch and will still works correctly */ class GBMBase{ public: /*! \brief number of thread used */ - GBMBase( void ){} + GBMBase(void){} /*! \brief destructor */ - virtual ~GBMBase( void ){ + virtual ~GBMBase(void){ this->FreeSpace(); } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - inline void SetParam( const char *name, const char *val ){ - if( !strncmp( name, "bst:", 4 ) ){ - cfg.PushBack( name + 4, val ); + inline void SetParam(const char *name, const char *val){ + if (!strncmp(name, "bst:", 4)){ + cfg.PushBack(name + 4, val); } - if( !strcmp( name, "silent") ){ - cfg.PushBack( name, val ); + if (!strcmp(name, "silent")){ + cfg.PushBack(name, val); } - tparam.SetParam( name, val ); - if( boosters.size() == 0 ) mparam.SetParam( name, val ); + tparam.SetParam(name, val); + if (boosters.size() == 0) mparam.SetParam(name, val); } - /*! + /*! 
* \brief load model from stream * \param fi input stream */ - inline void LoadModel( utils::IStream &fi ){ - if( boosters.size() != 0 ) this->FreeSpace(); - utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 ); - boosters.resize( mparam.num_boosters ); - for( size_t i = 0; i < boosters.size(); i ++ ){ - boosters[ i ] = booster::CreateBooster( mparam.booster_type ); - boosters[ i ]->LoadModel( fi ); + inline void LoadModel(utils::IStream &fi){ + if (boosters.size() != 0) this->FreeSpace(); + utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0); + boosters.resize(mparam.num_boosters); + for (size_t i = 0; i < boosters.size(); i++){ + boosters[i] = booster::CreateBooster(mparam.booster_type); + boosters[i]->LoadModel(fi); } {// load info - booster_info.resize( mparam.num_boosters ); - if( mparam.num_boosters != 0 ){ - utils::Assert( fi.Read( &booster_info[0], sizeof(int)*mparam.num_boosters ) != 0 ); + booster_info.resize(mparam.num_boosters); + if (mparam.num_boosters != 0){ + utils::Assert(fi.Read(&booster_info[0], sizeof(int)*mparam.num_boosters) != 0); } } - if( mparam.num_pbuffer != 0 ){ - pred_buffer.resize ( mparam.num_pbuffer ); - pred_counter.resize( mparam.num_pbuffer ); - utils::Assert( fi.Read( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ) != 0 ); - utils::Assert( fi.Read( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ) != 0 ); + if (mparam.num_pbuffer != 0){ + pred_buffer.resize(mparam.num_pbuffer); + pred_counter.resize(mparam.num_pbuffer); + utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0); + utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0); } } - /*! + /*! 
* \brief save model to stream * \param fo output stream */ - inline void SaveModel( utils::IStream &fo ) const { - utils::Assert( mparam.num_boosters == (int)boosters.size() ); - fo.Write( &mparam, sizeof(ModelParam) ); - for( size_t i = 0; i < boosters.size(); i ++ ){ - boosters[ i ]->SaveModel( fo ); + inline void SaveModel(utils::IStream &fo) const { + utils::Assert(mparam.num_boosters == (int)boosters.size()); + fo.Write(&mparam, sizeof(ModelParam)); + for (size_t i = 0; i < boosters.size(); i++){ + boosters[i]->SaveModel(fo); } - if( booster_info.size() != 0 ){ - fo.Write( &booster_info[0], sizeof(int) * booster_info.size() ); + if (booster_info.size() != 0){ + fo.Write(&booster_info[0], sizeof(int)* booster_info.size()); } - if( mparam.num_pbuffer != 0 ){ - fo.Write( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ); - fo.Write( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ); + if (mparam.num_pbuffer != 0){ + fo.Write(&pred_buffer[0], pred_buffer.size()*sizeof(float)); + fo.Write(&pred_counter[0], pred_counter.size()*sizeof(unsigned)); } } /*! * \brief initialize the current data storage for model, if the model is used first time, call this function */ - inline void InitModel( void ){ + inline void InitModel(void){ pred_buffer.clear(); pred_counter.clear(); - pred_buffer.resize ( mparam.num_pbuffer, 0.0 ); - pred_counter.resize( mparam.num_pbuffer, 0 ); - utils::Assert( mparam.num_boosters == 0 ); - utils::Assert( boosters.size() == 0 ); + pred_buffer.resize(mparam.num_pbuffer, 0.0); + pred_counter.resize(mparam.num_pbuffer, 0); + utils::Assert(mparam.num_boosters == 0); + utils::Assert(boosters.size() == 0); } /*! 
* \brief initialize solver before training, called before training - * this function is reserved for solver to allocate necessary space and do other preparation - */ - inline void InitTrainer( void ){ - if( tparam.nthread != 0 ){ - omp_set_num_threads( tparam.nthread ); + * this function is reserved for solver to allocate necessary space and do other preparation + */ + inline void InitTrainer(void){ + if (tparam.nthread != 0){ + omp_set_num_threads(tparam.nthread); } // make sure all the boosters get the latest parameters - for( size_t i = 0; i < this->boosters.size(); i ++ ){ - this->ConfigBooster( this->boosters[i] ); + for (size_t i = 0; i < this->boosters.size(); i++){ + this->ConfigBooster(this->boosters[i]); } } - /*! + /*! * \brief DumpModel - * \param fo text file + * \param fo text file * \param fmap feature map that may help give interpretations of feature * \param with_stats whether print statistics - */ - inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){ - for( size_t i = 0; i < boosters.size(); i ++ ){ - fprintf( fo, "booster[%d]\n", (int)i ); - boosters[i]->DumpModel( fo, fmap, with_stats ); + */ + inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){ + for (size_t i = 0; i < boosters.size(); i++){ + fprintf(fo, "booster[%d]\n", (int)i); + boosters[i]->DumpModel(fo, fmap, with_stats); } } - /*! + /*! 
* \brief Dump path of all trees - * \param fo text file + * \param fo text file * \param data input data */ - inline void DumpPath( FILE *fo, const FMatrixS &data ){ - for( size_t i = 0; i < data.NumRow(); ++ i ){ - for( size_t j = 0; j < boosters.size(); ++ j ){ - if( j != 0 ) fprintf( fo, "\t" ); + inline void DumpPath(FILE *fo, const FMatrixS &data){ + for (size_t i = 0; i < data.NumRow(); ++i){ + for (size_t j = 0; j < boosters.size(); ++j){ + if (j != 0) fprintf(fo, "\t"); std::vector path; - boosters[j]->PredPath( path, data, i ); - fprintf( fo, "%d", path[0] ); - for( size_t k = 1; k < path.size(); ++ k ){ - fprintf( fo, ",%d", path[k] ); + boosters[j]->PredPath(path, data, i); + fprintf(fo, "%d", path[0]); + for (size_t k = 1; k < path.size(); ++k){ + fprintf(fo, ",%d", path[k]); } } - fprintf( fo, "\n" ); + fprintf(fo, "\n"); } } public: - /*! + /*! * \brief do gradient boost training for one step, using the information given * Note: content of grad and hess can change after DoBoost * \param grad first order gradient of each instance * \param hess second order gradient of each instance * \param feats features of each instance - * \param root_index pre-partitioned root index of each instance, + * \param root_index pre-partitioned root index of each instance, * root_index.size() can be 0 which indicates that no pre-partition involved */ - inline void DoBoost( std::vector &grad, - std::vector &hess, - const booster::FMatrixS &feats, - const std::vector &root_index ) { + inline void DoBoost(std::vector &grad, + std::vector &hess, + const booster::FMatrixS &feats, + const std::vector &root_index) { booster::IBooster *bst = this->GetUpdateBooster(); - bst->DoBoost( grad, hess, feats, root_index ); + bst->DoBoost(grad, hess, feats, root_index); } - /*! + /*! 
* \brief predict values for given sparse feature vector * NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe * \param feats feature matrix * \param row_index row index in the feature matrix * \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned * \param root_index root id of current instance, default = 0 - * \return prediction + * \return prediction */ - inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ + inline float Predict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){ size_t istart = 0; float psum = 0.0f; // load buffered results if any - if( mparam.do_reboost == 0 && buffer_index >= 0 ){ - utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" ); - istart = this->pred_counter[ buffer_index ]; - psum = this->pred_buffer [ buffer_index ]; + if (mparam.do_reboost == 0 && buffer_index >= 0){ + utils::Assert(buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer"); + istart = this->pred_counter[buffer_index]; + psum = this->pred_buffer[buffer_index]; + } + + for (size_t i = istart; i < this->boosters.size(); i++){ + psum += this->boosters[i]->Predict(feats, row_index, root_index); } - - for( size_t i = istart; i < this->boosters.size(); i ++ ){ - psum += this->boosters[ i ]->Predict( feats, row_index, root_index ); - } // updated the buffered results - if( mparam.do_reboost == 0 && buffer_index >= 0 ){ - this->pred_counter[ buffer_index ] = static_cast( boosters.size() ); - this->pred_buffer [ buffer_index ] = psum; + if (mparam.do_reboost == 0 && buffer_index >= 0){ + this->pred_counter[buffer_index] = static_cast(boosters.size()); + this->pred_buffer[buffer_index] = psum; } return psum; } public: //--------trial code for interactive update an existing booster------ //-------- usually not needed, ignore this region --------- - /*! 
- * \brief same as Predict, but removes the prediction of booster to be updated + /*! + * \brief same as Predict, but removes the prediction of booster to be updated * this function must be called once and only once for every data with pbuffer */ - inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ - float psum = this->Predict( feats, row_index, buffer_index, root_index ); - if( tparam.reupdate_booster != -1 ){ + inline float InteractPredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){ + float psum = this->Predict(feats, row_index, buffer_index, root_index); + if (tparam.reupdate_booster != -1){ const int bid = tparam.reupdate_booster; - utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); - psum -= boosters[ bid ]->Predict( feats, row_index, root_index ); - if( mparam.do_reboost == 0 && buffer_index >= 0 ){ - this->pred_buffer[ buffer_index ] = psum; + utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound"); + psum -= boosters[bid]->Predict(feats, row_index, root_index); + if (mparam.do_reboost == 0 && buffer_index >= 0){ + this->pred_buffer[buffer_index] = psum; } } return psum; } /*! 
\brief delete the specified booster */ - inline void DelteBooster( void ){ + inline void DelteBooster(void){ const int bid = tparam.reupdate_booster; - utils::Assert( bid >= 0 && bid < mparam.num_boosters , "must specify booster index for deletion"); - delete boosters[ bid ]; - for( int i = bid + 1; i < mparam.num_boosters; ++ i ){ - boosters[i-1] = boosters[ i ]; - booster_info[i-1] = booster_info[ i ]; - } - boosters.resize( mparam.num_boosters -= 1 ); - booster_info.resize( boosters.size() ); + utils::Assert(bid >= 0 && bid < mparam.num_boosters, "must specify booster index for deletion"); + delete boosters[bid]; + for (int i = bid + 1; i < mparam.num_boosters; ++i){ + boosters[i - 1] = boosters[i]; + booster_info[i - 1] = booster_info[i]; + } + boosters.resize(mparam.num_boosters -= 1); + booster_info.resize(boosters.size()); } - /*! \brief update the prediction buffer, after booster have been updated */ - inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ - if( tparam.reupdate_booster != -1 ){ + /*! \brief update the prediction buffer, after booster have been updated */ + inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){ + if (tparam.reupdate_booster != -1){ const int bid = tparam.reupdate_booster; - utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); - if( mparam.do_reboost == 0 && buffer_index >= 0 ){ - this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index ); + utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound"); + if (mparam.do_reboost == 0 && buffer_index >= 0){ + this->pred_buffer[buffer_index] += boosters[bid]->Predict(feats, row_index, root_index); } } } //-----------non public fields afterwards------------- protected: /*! 
\brief free space of the model */ - inline void FreeSpace( void ){ - for( size_t i = 0; i < boosters.size(); i ++ ){ + inline void FreeSpace(void){ + for (size_t i = 0; i < boosters.size(); i++){ delete boosters[i]; } - boosters.clear(); booster_info.clear(); mparam.num_boosters = 0; + boosters.clear(); booster_info.clear(); mparam.num_boosters = 0; } /*! \brief configure a booster */ - inline void ConfigBooster( booster::IBooster *bst ){ + inline void ConfigBooster(booster::IBooster *bst){ cfg.BeforeFirst(); - while( cfg.Next() ){ - bst->SetParam( cfg.name(), cfg.val() ); + while (cfg.Next()){ + bst->SetParam(cfg.name(), cfg.val()); } } - /*! - * \brief get a booster to update + /*! + * \brief get a booster to update * \return the booster created */ - inline booster::IBooster *GetUpdateBooster( void ){ - if( tparam.reupdate_booster != -1 ){ + inline booster::IBooster *GetUpdateBooster(void){ + if (tparam.reupdate_booster != -1){ const int bid = tparam.reupdate_booster; - utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); - this->ConfigBooster( boosters[bid] ); - return boosters[ bid ]; + utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound"); + this->ConfigBooster(boosters[bid]); + return boosters[bid]; } - if( mparam.do_reboost == 0 || boosters.size() == 0 ){ + if (mparam.do_reboost == 0 || boosters.size() == 0){ mparam.num_boosters += 1; - boosters.push_back( booster::CreateBooster( mparam.booster_type ) ); - booster_info.push_back( 0 ); - this->ConfigBooster( boosters.back() ); - boosters.back()->InitModel(); - }else{ - this->ConfigBooster( boosters.back() ); + boosters.push_back(booster::CreateBooster(mparam.booster_type)); + booster_info.push_back(0); + this->ConfigBooster(boosters.back()); + boosters.back()->InitModel(); + } + else{ + this->ConfigBooster(boosters.back()); } return boosters.back(); } @@ -306,76 +307,76 @@ namespace xgboost{ int num_feature; 
/*! \brief size of predicton buffer allocated for buffering boosting computation */ int num_pbuffer; - /*! + /*! * \brief whether we repeatly update a single booster each round: default 0 * set to 1 for linear booster, so that regularization term can be considered */ int do_reboost; /*! \brief reserved parameters */ - int reserved[ 32 ]; + int reserved[32]; /*! \brief constructor */ - ModelParam( void ){ - num_boosters = 0; + ModelParam(void){ + num_boosters = 0; booster_type = 0; - num_roots = num_feature = 0; + num_roots = num_feature = 0; do_reboost = 0; num_pbuffer = 0; - memset( reserved, 0, sizeof( reserved ) ); + memset(reserved, 0, sizeof(reserved)); } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - inline void SetParam( const char *name, const char *val ){ - if( !strcmp("booster_type", name ) ){ - booster_type = atoi( val ); + inline void SetParam(const char *name, const char *val){ + if (!strcmp("booster_type", name)){ + booster_type = atoi(val); // linear boost automatically set do reboost - if( booster_type == 1 ) do_reboost = 1; + if (booster_type == 1) do_reboost = 1; } - if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val ); - if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val ); - if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val ); - if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); + if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val); + if (!strcmp("do_reboost", name)) do_reboost = atoi(val); + if (!strcmp("bst:num_roots", name)) num_roots = atoi(val); + if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); } }; /*! \brief training parameters */ struct TrainParam{ /*! \brief number of OpenMP threads */ int nthread; - /*! - * \brief index of specific booster to be re-updated, default = -1: update new booster + /*! 
+ * \brief index of specific booster to be re-updated, default = -1: update new booster * parameter this is part of trial interactive update mode */ int reupdate_booster; /*! \brief constructor */ - TrainParam( void ) { + TrainParam(void) { nthread = 1; reupdate_booster = -1; } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter - */ - inline void SetParam( const char *name, const char *val ){ - if( !strcmp("nthread", name ) ) nthread = atoi( val ); - if( !strcmp("interact:booster_index", name ) ) reupdate_booster = atoi( val ); + */ + inline void SetParam(const char *name, const char *val){ + if (!strcmp("nthread", name)) nthread = atoi(val); + if (!strcmp("interact:booster_index", name)) reupdate_booster = atoi(val); } }; protected: - /*! \brief model parameters */ + /*! \brief model parameters */ ModelParam mparam; - /*! \brief training parameters */ + /*! \brief training parameters */ TrainParam tparam; protected: - /*! \brief component boosters */ + /*! \brief component boosters */ std::vector boosters; - /*! \brief some information indicator of the booster, reserved */ + /*! \brief some information indicator of the booster, reserved */ std::vector booster_info; - /*! \brief prediction buffer */ + /*! \brief prediction buffer */ std::vector pred_buffer; - /*! \brief prediction buffer counter, record the progress so fart of the buffer */ + /*! \brief prediction buffer counter, record the progress so fart of the buffer */ std::vector pred_counter; /*! 
\brief configurations saved for each booster */ utils::ConfigSaver cfg; diff --git a/demo/rank/README b/demo/rank/README new file mode 100644 index 000000000..43eb1e431 --- /dev/null +++ b/demo/rank/README @@ -0,0 +1,13 @@ +Demonstrating how to use XGBoost accomplish regression tasks on computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware + +Run: ./runexp.sh + +Format of input: LIBSVM format + +Format of ```featmap.txt: \n ```: + - Feature id must be from 0 to number of features, in sorted order. + - i means this feature is binary indicator feature + - q means this feature is a quantitative value, such as age, time, can be missing + - int means this feature is integer value (when int is hinted, the decision boundary will be integer) + +Explainations: https://github.com/tqchen/xgboost/wiki/Regression diff --git a/demo/rank/runexp.sh b/demo/rank/runexp.sh new file mode 100644 index 000000000..900a80cce --- /dev/null +++ b/demo/rank/runexp.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# map the data to features. 
For convenience we only use 7 original attributes and encode them as features in a trivial way +python mapfeat.py +# split train and test +python mknfold.py machine.txt 1 +# training and output the models +../../xgboost machine.conf +# output predictions of test data +../../xgboost machine.conf task=pred model_in=0002.model +# print the boosters of 0002.model in dump.raw.txt +../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt +# print the boosters of 0002.model in dump.nice.txt with feature map +../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt + +# cat the result +cat dump.nice.txt diff --git a/demo/rank/toy.eval b/demo/rank/toy.eval new file mode 100644 index 000000000..8dad91eb1 --- /dev/null +++ b/demo/rank/toy.eval @@ -0,0 +1,5 @@ +1 0:2 1:3 2:2 +0 0:2 1:3 2:2 +0 0:2 1:3 2:2 +0 0:2 1:3 2:2 +1 0:2 1:3 2:2 diff --git a/demo/rank/toy.eval.group b/demo/rank/toy.eval.group new file mode 100644 index 000000000..4792e70f3 --- /dev/null +++ b/demo/rank/toy.eval.group @@ -0,0 +1,2 @@ +2 +3 diff --git a/demo/rank/toy.test b/demo/rank/toy.test new file mode 100644 index 000000000..8dad91eb1 --- /dev/null +++ b/demo/rank/toy.test @@ -0,0 +1,5 @@ +1 0:2 1:3 2:2 +0 0:2 1:3 2:2 +0 0:2 1:3 2:2 +0 0:2 1:3 2:2 +1 0:2 1:3 2:2 diff --git a/demo/rank/toy.test.group b/demo/rank/toy.test.group new file mode 100644 index 000000000..4792e70f3 --- /dev/null +++ b/demo/rank/toy.test.group @@ -0,0 +1,2 @@ +2 +3 diff --git a/demo/rank/toy.train b/demo/rank/toy.train new file mode 100644 index 000000000..cd8b6d628 --- /dev/null +++ b/demo/rank/toy.train @@ -0,0 +1,11 @@ +1 0:1.2 1:3 2:5.6 +0 0:2.0 1:2.3 2:5.1 +0 0:3.9 1:3 2:3.1 +0 0:2 1:3.2 2:3.4 +1 0:2.1 1:4.5 2:4.2 +0 0:1.9 1:2.8 2:3.1 +1 0:3.0 1:2.0 2:1.1 +0 0:1.9 1:1.8 2:2.1 +0 0:1.1 1:2.2 2:1.4 +1 0:2.1 1:4.1 2:4.0 +0 0:1.9 1:2.2 2:1.1 diff --git a/demo/rank/toy.train.group b/demo/rank/toy.train.group new file mode 100644 index 000000000..ec385ae9f --- 
/dev/null +++ b/demo/rank/toy.train.group @@ -0,0 +1,2 @@ +6 +5 \ No newline at end of file diff --git a/demo/rank/train b/demo/rank/train new file mode 100644 index 000000000..e69de29bb diff --git a/dev/base/xgboost_boost_task.h b/dev/base/xgboost_boost_task.h index 1234eee8f..b79af31e4 100644 --- a/dev/base/xgboost_boost_task.h +++ b/dev/base/xgboost_boost_task.h @@ -11,314 +11,319 @@ #include "../utils/xgboost_config.h" namespace xgboost{ - namespace base{ - /*! - * \brief wrapping the training process of the gradient boosting model, - * given the configuation - * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com - */ - class BoostTask{ - public: - inline int Run(int argc, char *argv[]){ - if (argc < 2){ - printf("Usage: \n"); - return 0; - } - utils::ConfigIterator itr(argv[1]); - while (itr.Next()){ - this->SetParam(itr.name(), itr.val()); - } - for (int i = 2; i < argc; i++){ - char name[256], val[256]; - if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){ - this->SetParam(name, val); - } - } - this->InitData(); - this->InitLearner(); - if (task == "dump"){ - this->TaskDump(); - return 0; - } - if (task == "interact"){ - this->TaskInteractive(); return 0; - } - if (task == "dumppath"){ - this->TaskDumpPath(); return 0; - } - if (task == "eval"){ - this->TaskEval(); return 0; - } - if (task == "pred"){ - this->TaskPred(); - } - else{ - this->TaskTrain(); - } - return 0; - } + namespace base{ + /*! 
+ * \brief wrapping the training process of the gradient boosting model, + * given the configuation + * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com + */ + class BoostTask{ + public: + inline int Run(int argc, char *argv[]){ - enum learning_tasks{ - REGRESSION = 0, - BINARY_CLASSIFICATION = 1, - RANKING = 2 - }; + if (argc < 2){ + printf("Usage: \n"); + return 0; + } + utils::ConfigIterator itr(argv[1]); + while (itr.Next()){ + this->SetParam(itr.name(), itr.val()); + } + for (int i = 2; i < argc; i++){ + char name[256], val[256]; + if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){ + this->SetParam(name, val); + } + } + + this->InitData(); + this->InitLearner(); + if (task == "dump"){ + this->TaskDump(); + return 0; + } + if (task == "interact"){ + this->TaskInteractive(); return 0; + } + if (task == "dumppath"){ + this->TaskDumpPath(); return 0; + } + if (task == "eval"){ + this->TaskEval(); return 0; + } + if (task == "pred"){ + this->TaskPred(); + } + else{ + this->TaskTrain(); + } + return 0; + } - /* \brief set learner - * \param learner the passed in learner - */ - inline void SetLearner(BoostLearner* learner){ - learner_ = learner; - } + enum learning_tasks{ + REGRESSION = 0, + BINARY_CLASSIFICATION = 1, + RANKING = 2 + }; - inline void SetParam(const char *name, const char *val){ - if (!strcmp("learning_task", name)) learning_task = atoi(val); - if (!strcmp("silent", name)) silent = atoi(val); - if (!strcmp("use_buffer", name)) use_buffer = atoi(val); - if (!strcmp("seed", name)) random::Seed(atoi(val)); - if (!strcmp("num_round", name)) num_round = atoi(val); - if (!strcmp("save_period", name)) save_period = atoi(val); - if (!strcmp("task", name)) task = val; - if (!strcmp("data", name)) train_path = val; - if (!strcmp("test:data", name)) test_path = val; - if (!strcmp("model_in", name)) model_in = val; - if (!strcmp("model_out", name)) model_out = val; - if (!strcmp("model_dir", name)) model_dir_path = val; - if 
(!strcmp("fmap", name)) name_fmap = val; - if (!strcmp("name_dump", name)) name_dump = val; - if (!strcmp("name_dumppath", name)) name_dumppath = val; - if (!strcmp("name_pred", name)) name_pred = val; - if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); - if (!strcmp("interact:action", name)) interact_action = val; - if (!strncmp("batch:", name, 6)){ - cfg_batch.PushBack(name + 6, val); - } - if (!strncmp("eval[", name, 5)) { - char evname[256]; - utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); - eval_data_names.push_back(std::string(evname)); - eval_data_paths.push_back(std::string(val)); - } - cfg.PushBack(name, val); - } - public: - BoostTask(void){ - // default parameters - silent = 0; - use_buffer = 1; - num_round = 10; - save_period = 0; - dump_model_stats = 0; - task = "train"; - model_in = "NULL"; - model_out = "NULL"; - name_fmap = "NULL"; - name_pred = "pred.txt"; - name_dump = "dump.txt"; - name_dumppath = "dump.path.txt"; - model_dir_path = "./"; - interact_action = "update"; - } - ~BoostTask(void){ - for (size_t i = 0; i < deval.size(); i++){ - delete deval[i]; - } - } - private: + /* \brief set learner + * \param learner the passed in learner + */ + inline void SetLearner(BoostLearner* learner){ + learner_ = learner; + } + + inline void SetParam(const char *name, const char *val){ + if (!strcmp("learning_task", name)) learning_task = atoi(val); + if (!strcmp("silent", name)) silent = atoi(val); + if (!strcmp("use_buffer", name)) use_buffer = atoi(val); + if (!strcmp("seed", name)) random::Seed(atoi(val)); + if (!strcmp("num_round", name)) num_round = atoi(val); + if (!strcmp("save_period", name)) save_period = atoi(val); + if (!strcmp("task", name)) task = val; + if (!strcmp("data", name)) train_path = val; + if (!strcmp("test:data", name)) test_path = val; + if (!strcmp("model_in", name)) model_in = val; + if (!strcmp("model_out", name)) model_out = val; + if (!strcmp("model_dir", 
name)) model_dir_path = val; + if (!strcmp("fmap", name)) name_fmap = val; + if (!strcmp("name_dump", name)) name_dump = val; + if (!strcmp("name_dumppath", name)) name_dumppath = val; + if (!strcmp("name_pred", name)) name_pred = val; + if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); + if (!strcmp("interact:action", name)) interact_action = val; + if (!strncmp("batch:", name, 6)){ + cfg_batch.PushBack(name + 6, val); + } + if (!strncmp("eval[", name, 5)) { + char evname[256]; + utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); + eval_data_names.push_back(std::string(evname)); + eval_data_paths.push_back(std::string(val)); + } + cfg.PushBack(name, val); + } + public: + BoostTask(void){ + // default parameters + silent = 0; + use_buffer = 1; + num_round = 10; + save_period = 0; + dump_model_stats = 0; + task = "train"; + model_in = "NULL"; + model_out = "NULL"; + name_fmap = "NULL"; + name_pred = "pred.txt"; + name_dump = "dump.txt"; + name_dumppath = "dump.path.txt"; + model_dir_path = "./"; + interact_action = "update"; + } + ~BoostTask(void){ + for (size_t i = 0; i < deval.size(); i++){ + delete deval[i]; + } + } + private: - inline void InitData(void){ - if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str()); - if (task == "dump") return; - if (learning_task == RANKING){ - char instance_path[256], group_path[256]; - if (task == "pred" || task == "dumppath"){ - sscanf(test_path.c_str(), "%[^;];%s", instance_path, group_path); - data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); - } - else{ - // training - sscanf(train_path.c_str(), "%[^;];%s", instance_path, group_path); - data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); - utils::Assert(eval_data_names.size() == eval_data_paths.size()); - for (size_t i = 0; i < eval_data_names.size(); ++i){ - deval.push_back(new DMatrix()); - sscanf(eval_data_paths[i].c_str(), "%[^;];%s", instance_path, group_path); - 
deval.back()->CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); - } - } + inline void InitData(void){ + + if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str()); + if (task == "dump") return; + if (learning_task == RANKING){ + char instance_path[256], group_path[256]; + if (task == "pred" || task == "dumppath"){ + sscanf(test_path.c_str(), "%[^;];%s", instance_path, group_path); + data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); + } + else{ + // training + sscanf(train_path.c_str(), "%[^;];%s", instance_path, group_path); + data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); + + utils::Assert(eval_data_names.size() == eval_data_paths.size()); + for (size_t i = 0; i < eval_data_names.size(); ++i){ + deval.push_back(new DMatrix()); + sscanf(eval_data_paths[i].c_str(), "%[^;];%s", instance_path, group_path); + deval.back()->CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0); + } + } + } + else{ + if (task == "pred" || task == "dumppath"){ + data.CacheLoad(test_path.c_str(), "", silent != 0, use_buffer != 0); + } + else{ + // training + data.CacheLoad(train_path.c_str(), "", silent != 0, use_buffer != 0); + utils::Assert(eval_data_names.size() == eval_data_paths.size()); + for (size_t i = 0; i < eval_data_names.size(); ++i){ + deval.push_back(new DMatrix()); + deval.back()->CacheLoad(eval_data_paths[i].c_str(), "", silent != 0, use_buffer != 0); + } + } + } + learner_->SetData(&data, deval, eval_data_names); + if(!silent) printf("BoostTask:Data Initiation Done!\n"); + } + + inline void InitLearner(void){ + cfg.BeforeFirst(); + while (cfg.Next()){ + learner_->SetParam(cfg.name(), cfg.val()); + } + if (model_in != "NULL"){ + utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb")); + learner_->LoadModel(fi); + fi.Close(); + } + else{ + utils::Assert(task == "train", "model_in not specified"); + learner_->InitModel(); + } + learner_->InitTrainer(); + if(!silent) 
printf("BoostTask:InitLearner Done!\n"); + } - } - else{ - if (task == "pred" || task == "dumppath"){ - data.CacheLoad(test_path.c_str(), "", silent != 0, use_buffer != 0); - } - else{ - // training - data.CacheLoad(train_path.c_str(), "", silent != 0, use_buffer != 0); - utils::Assert(eval_data_names.size() == eval_data_paths.size()); - for (size_t i = 0; i < eval_data_names.size(); ++i){ - deval.push_back(new DMatrix()); - deval.back()->CacheLoad(eval_data_paths[i].c_str(), "", silent != 0, use_buffer != 0); - } - } - } + inline void TaskTrain(void){ + const time_t start = time(NULL); + unsigned long elapsed = 0; + for (int i = 0; i < num_round; ++i){ + elapsed = (unsigned long)(time(NULL) - start); + if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); + learner_->UpdateOneIter(i); + learner_->EvalOneIter(i); + if (save_period != 0 && (i + 1) % save_period == 0){ + this->SaveModel(i); + } + elapsed = (unsigned long)(time(NULL) - start); + } + // always save final round + if (save_period == 0 || num_round % save_period != 0){ + if (model_out == "NULL"){ + this->SaveModel(num_round - 1); + } + else{ + this->SaveModel(model_out.c_str()); + } + } + if (!silent){ + printf("\nupdating end, %lu sec in all\n", elapsed); + } + } + inline void TaskEval(void){ + learner_->EvalOneIter(0); + } + inline void TaskInteractive(void){ + const time_t start = time(NULL); + unsigned long elapsed = 0; + int batch_action = 0; - learner_->SetData(&data, deval, eval_data_names); - } - inline void InitLearner(void){ - cfg.BeforeFirst(); - while (cfg.Next()){ - learner_->SetParam(cfg.name(), cfg.val()); - } - if (model_in != "NULL"){ - utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb")); - learner_->LoadModel(fi); - fi.Close(); - } - else{ - utils::Assert(task == "train", "model_in not specified"); - learner_->InitModel(); - } - learner_->InitTrainer(); - } + cfg_batch.BeforeFirst(); + while (cfg_batch.Next()){ + if (!strcmp(cfg_batch.name(), "run")){ + 
learner_->UpdateInteract(interact_action); + batch_action += 1; + } + else{ + learner_->SetParam(cfg_batch.name(), cfg_batch.val()); + } + } - inline void TaskTrain(void){ - const time_t start = time(NULL); - unsigned long elapsed = 0; - for (int i = 0; i < num_round; ++i){ - elapsed = (unsigned long)(time(NULL) - start); - if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); - learner_->UpdateOneIter(i); - learner_->EvalOneIter(i); - if (save_period != 0 && (i + 1) % save_period == 0){ - this->SaveModel(i); - } - elapsed = (unsigned long)(time(NULL) - start); - } - // always save final round - if (save_period == 0 || num_round % save_period != 0){ - if (model_out == "NULL"){ - this->SaveModel(num_round - 1); - } - else{ - this->SaveModel(model_out.c_str()); - } - } - if (!silent){ - printf("\nupdating end, %lu sec in all\n", elapsed); - } - } - inline void TaskEval(void){ - learner_->EvalOneIter(0); - } - inline void TaskInteractive(void){ - const time_t start = time(NULL); - unsigned long elapsed = 0; - int batch_action = 0; + if (batch_action == 0){ + learner_->UpdateInteract(interact_action); + } + utils::Assert(model_out != "NULL", "interactive mode must specify model_out"); + this->SaveModel(model_out.c_str()); + elapsed = (unsigned long)(time(NULL) - start); - cfg_batch.BeforeFirst(); - while (cfg_batch.Next()){ - if (!strcmp(cfg_batch.name(), "run")){ - learner_->UpdateInteract(interact_action); - batch_action += 1; - } - else{ - learner_->SetParam(cfg_batch.name(), cfg_batch.val()); - } - } + if (!silent){ + printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed); + } + } - if (batch_action == 0){ - learner_->UpdateInteract(interact_action); - } - utils::Assert(model_out != "NULL", "interactive mode must specify model_out"); - this->SaveModel(model_out.c_str()); - elapsed = (unsigned long)(time(NULL) - start); + inline void TaskDump(void){ + FILE *fo = utils::FopenCheck(name_dump.c_str(), "w"); + 
learner_->DumpModel(fo, fmap, dump_model_stats != 0); + fclose(fo); + } + inline void TaskDumpPath(void){ + FILE *fo = utils::FopenCheck(name_dumppath.c_str(), "w"); + learner_->DumpPath(fo, data); + fclose(fo); + } + inline void SaveModel(const char *fname) const{ + utils::FileStream fo(utils::FopenCheck(fname, "wb")); + learner_->SaveModel(fo); + fo.Close(); + } + inline void SaveModel(int i) const{ + char fname[256]; + sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1); + this->SaveModel(fname); + } + inline void TaskPred(void){ + std::vector preds; + if (!silent) printf("start prediction...\n"); + learner_->Predict(preds, data); + if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); + FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); + for (size_t i = 0; i < preds.size(); i++){ + fprintf(fo, "%f\n", preds[i]); + } + fclose(fo); + } + private: + /* \brief specify the learning task*/ + int learning_task; + /* \brief whether silent */ + int silent; + /* \brief whether use auto binary buffer */ + int use_buffer; + /* \brief number of boosting iterations */ + int num_round; + /* \brief the period to save the model, 0 means only save the final round model */ + int save_period; + /*! 
\brief interfact action */ + std::string interact_action; + /* \brief the path of training/test data set */ + std::string train_path, test_path; + /* \brief the path of test model file, or file to restart training */ + std::string model_in; + /* \brief the path of final model file, to be saved */ + std::string model_out; + /* \brief the path of directory containing the saved models */ + std::string model_dir_path; + /* \brief task to perform, choosing training or testing */ + std::string task; + /* \brief name of predict file */ + std::string name_pred; + /* \brief whether dump statistics along with model */ + int dump_model_stats; + /* \brief name of feature map */ + std::string name_fmap; + /* \brief name of dump file */ + std::string name_dump; + /* \brief name of dump path file */ + std::string name_dumppath; + /* \brief the paths of validation data sets */ + std::vector eval_data_paths; + /* \brief the names of the evaluation data used in output log */ + std::vector eval_data_names; + /*! \brief saves configurations */ + utils::ConfigSaver cfg; + /*! 
\brief batch configurations */ + utils::ConfigSaver cfg_batch; + private: + DMatrix data; + std::vector deval; + utils::FeatMap fmap; + BoostLearner* learner_; - if (!silent){ - printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed); - } - } - - inline void TaskDump(void){ - FILE *fo = utils::FopenCheck(name_dump.c_str(), "w"); - learner_->DumpModel(fo, fmap, dump_model_stats != 0); - fclose(fo); - } - inline void TaskDumpPath(void){ - FILE *fo = utils::FopenCheck(name_dumppath.c_str(), "w"); - learner_->DumpPath(fo, data); - fclose(fo); - } - inline void SaveModel(const char *fname) const{ - utils::FileStream fo(utils::FopenCheck(fname, "wb")); - learner_->SaveModel(fo); - fo.Close(); - } - inline void SaveModel(int i) const{ - char fname[256]; - sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1); - this->SaveModel(fname); - } - inline void TaskPred(void){ - std::vector preds; - if (!silent) printf("start prediction...\n"); - learner_->Predict(preds, data); - if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); - FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); - for (size_t i = 0; i < preds.size(); i++){ - fprintf(fo, "%f\n", preds[i]); - } - fclose(fo); - } - private: - /* \brief specify the learning task*/ - int learning_task; - /* \brief whether silent */ - int silent; - /* \brief whether use auto binary buffer */ - int use_buffer; - /* \brief number of boosting iterations */ - int num_round; - /* \brief the period to save the model, 0 means only save the final round model */ - int save_period; - /*! 
\brief interfact action */ - std::string interact_action; - /* \brief the path of training/test data set */ - std::string train_path, test_path; - /* \brief the path of test model file, or file to restart training */ - std::string model_in; - /* \brief the path of final model file, to be saved */ - std::string model_out; - /* \brief the path of directory containing the saved models */ - std::string model_dir_path; - /* \brief task to perform, choosing training or testing */ - std::string task; - /* \brief name of predict file */ - std::string name_pred; - /* \brief whether dump statistics along with model */ - int dump_model_stats; - /* \brief name of feature map */ - std::string name_fmap; - /* \brief name of dump file */ - std::string name_dump; - /* \brief name of dump path file */ - std::string name_dumppath; - /* \brief the paths of validation data sets */ - std::vector eval_data_paths; - /* \brief the names of the evaluation data used in output log */ - std::vector eval_data_names; - /*! \brief saves configurations */ - utils::ConfigSaver cfg; - /*! \brief batch configurations */ - utils::ConfigSaver cfg_batch; - private: - DMatrix data; - std::vector deval; - utils::FeatMap fmap; - BoostLearner* learner_; - - }; - }; + }; + }; }; diff --git a/dev/base/xgboost_data_instance.h b/dev/base/xgboost_data_instance.h index e33ca687a..6ac5c5d13 100644 --- a/dev/base/xgboost_data_instance.h +++ b/dev/base/xgboost_data_instance.h @@ -9,183 +9,206 @@ namespace xgboost{ - namespace base{ - /*! \brief data matrix for regression,classification,rank content */ - struct DMatrix{ - public: - /*! \brief maximum feature dimension */ - unsigned num_feature; - /*! \brief feature data content */ - booster::FMatrixS data; - /*! \brief label of each instance */ - std::vector labels; - /*! \brief the index of begin and end of a group, - * needed when the learning task is ranking*/ - std::vector group_index; - public: - /*! 
\brief default constructor */ - DMatrix(void){} + namespace base{ + /*! \brief data matrix for regression, classification, rank content */ + struct DMatrix{ + public: + /*! \brief maximum feature dimension */ + unsigned num_feature; + /*! \brief feature data content */ + booster::FMatrixS data; + /*! \brief label of each instance */ + std::vector labels; + /*! \brief the index of begin and end of a group, + * needed when the learning task is ranking*/ + std::vector group_index; + public: + /*! \brief default constructor */ + DMatrix(void){} - /*! \brief get the number of instances */ - inline size_t Size() const{ - return labels.size(); - } - /*! - * \brief load from text file - * \param fname file of instances data - * \param fgroup file of the group data - * \param silent whether print information or not - */ - inline void LoadText(const char* fname, const char* fgroup, bool silent = false){ - data.Clear(); - FILE* file = utils::FopenCheck(fname, "r"); - float label; bool init = true; - char tmp[1024]; - std::vector findex; - std::vector fvalue; + /*! \brief get the number of instances */ + inline size_t Size() const{ + return labels.size(); + } + /*! 
+ * \brief load from text file + * \param fname file of instances data + * \param fgroup file of the group data + * \param silent whether print information or not + */ + inline void LoadText(const char* fname, const char* fgroup, bool silent = false){ + data.Clear(); + FILE* file = utils::FopenCheck(fname, "r"); + float label; bool init = true; + char tmp[1024]; + std::vector findex; + std::vector fvalue; - while (fscanf(file, "%s", tmp) == 1){ - unsigned index; float value; - if (sscanf(tmp, "%u:%f", &index, &value) == 2){ - findex.push_back(index); fvalue.push_back(value); - } - else{ - if (!init){ - labels.push_back(label); - data.AddRow(findex, fvalue); - } - findex.clear(); fvalue.clear(); - utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format"); - init = false; - } - } + while (fscanf(file, "%s", tmp) == 1){ + unsigned index; float value; + if (sscanf(tmp, "%u:%f", &index, &value) == 2){ + findex.push_back(index); fvalue.push_back(value); + } + else{ + if (!init){ + labels.push_back(label); + data.AddRow(findex, fvalue); + } + findex.clear(); fvalue.clear(); + utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format"); + init = false; + } + } - labels.push_back(label); - data.AddRow(findex, fvalue); - // initialize column support as well - data.InitData(); + labels.push_back(label); + data.AddRow(findex, fvalue); + // initialize column support as well + data.InitData(); + + if (!silent){ + printf("%ux%u matrix with %lu entries is loaded from %s\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); + } + fclose(file); + LoadGroup(fgroup,silent); + } + + inline void LoadGroup(const char* fgroup, bool silent = false){ + //if exists group data load it in + FILE *file_group = fopen64(fgroup, "r"); + + if (file_group != NULL){ + group_index.push_back(0); + int tmp = 0, acc = 0,cnt = 0; + while (fscanf(file_group, "%d", &tmp) == 1){ + acc += tmp; + group_index.push_back(acc); + cnt++; + } + if(!silent) 
printf("%d groups are loaded from %s\n",cnt,fgroup); + fclose(file_group); + }else{ + if(!silent) printf("There is no group file\n"); + } + + } + /*! + * \brief load from binary file + * \param fname name of binary data + * \param silent whether print information or not + * \return whether loading is success + */ + inline bool LoadBinary(const char* fname, const char* fgroup, bool silent = false){ + FILE *fp = fopen64(fname, "rb"); + if (fp == NULL) return false; + utils::FileStream fs(fp); + data.LoadBinary(fs); + labels.resize(data.NumRow()); + utils::Assert(fs.Read(&labels[0], sizeof(float) * data.NumRow()) != 0, "DMatrix LoadBinary"); + fs.Close(); + // initialize column support as well + data.InitData(); - if (!silent){ - printf("%ux%u matrix with %lu entries is loaded from %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); - } - fclose(file); + if (!silent){ + printf("%ux%u matrix with %lu entries is loaded from %s as binary\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); + } - //if exists group data load it in - FILE *file_group = fopen64(fgroup, "r"); - if (file_group != NULL){ - group_index.push_back(0); - int tmp = 0, acc = 0; - while (fscanf(file_group, "%d", tmp) == 1){ - acc += tmp; - group_index.push_back(acc); - } - } - } - /*! - * \brief load from binary file - * \param fname name of binary data - * \param silent whether print information or not - * \return whether loading is success - */ - inline bool LoadBinary(const char* fname, const char* fgroup, bool silent = false){ - FILE *fp = fopen64(fname, "rb"); - if (fp == NULL) return false; - utils::FileStream fs(fp); - data.LoadBinary(fs); - labels.resize(data.NumRow()); - utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary"); - fs.Close(); - // initialize column support as well - data.InitData(); + LoadGroupBinary(fgroup,silent); + return true; + } + + /*! 
+ * \brief save to binary file + * \param fname name of binary data + * \param silent whether print information or not + */ + inline void SaveBinary(const char* fname, const char* fgroup, bool silent = false){ + // initialize column support as well + data.InitData(); - if (!silent){ - printf("%ux%u matrix with %lu entries is loaded from %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); - } + utils::FileStream fs(utils::FopenCheck(fname, "wb")); + data.SaveBinary(fs); + fs.Write(&labels[0], sizeof(float)* data.NumRow()); + fs.Close(); + if (!silent){ + printf("%ux%u matrix with %lu entries is saved to %s as binary\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); + } - //if group data exists load it in - FILE *file_group = fopen64(fgroup, "r"); - if (file_group != NULL){ - int group_index_size = 0; - utils::FileStream group_stream(file_group); - utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size"); - group_index.resize(group_index_size); - utils::Assert(group_stream.Read(&group_index, sizeof(int)* group_index_size) != 0, "Load group indice"); + SaveGroupBinary(fgroup,silent); + } + + inline void SaveGroupBinary(const char* fgroup, bool silent = false){ + //save group data + if (group_index.size() > 0){ + utils::FileStream file_group(utils::FopenCheck(fgroup, "wb")); + int group_index_size = group_index.size(); + file_group.Write(&(group_index_size), sizeof(int)); + file_group.Write(&group_index[0], sizeof(int) * group_index_size); + file_group.Close(); + if(!silent){printf("Index info of %d groups is saved to %s as binary\n",group_index_size-1,fgroup);} + } + } + + inline void LoadGroupBinary(const char* fgroup, bool silent = false){ + //if group data exists load it in + FILE *file_group = fopen64(fgroup, "r"); + if (file_group != NULL){ + int group_index_size = 0; + utils::FileStream group_stream(file_group); + 
utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size"); + group_index.resize(group_index_size); + utils::Assert(group_stream.Read(&group_index[0], sizeof(int) * group_index_size) != 0, "Load group indice"); - if (!silent){ - printf("the group index of %d groups is loaded from %s\n", - group_index_size - 1, fgroup); - } - } - return true; - } - /*! - * \brief save to binary file - * \param fname name of binary data - * \param silent whether print information or not - */ - inline void SaveBinary(const char* fname, const char* fgroup, bool silent = false){ - // initialize column support as well - data.InitData(); - - utils::FileStream fs(utils::FopenCheck(fname, "wb")); - data.SaveBinary(fs); - fs.Write(&labels[0], sizeof(float)* data.NumRow()); - fs.Close(); - if (!silent){ - printf("%ux%u matrix with %lu entries is saved to %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); - } - - //save group data - if (group_index.size() > 0){ - utils::FileStream file_group(utils::FopenCheck(fgroup, "wb")); - int group_index_size = group_index.size(); - file_group.Write(&(group_index_size), sizeof(int)); - file_group.Write(&group_index[0], sizeof(int) * group_index_size); - } - - } - /*! 
- * \brief cache load data given a file name, if filename ends with .buffer, direct load binary - * otherwise the function will first check if fname + '.buffer' exists, - * if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file, - * and try to create a buffer file - * \param fname name of binary data - * \param silent whether print information or not - * \param savebuffer whether do save binary buffer if it is text - */ - inline void CacheLoad(const char *fname, const char *fgroup, bool silent = false, bool savebuffer = true){ - int len = strlen(fname); - if (len > 8 && !strcmp(fname + len - 7, ".buffer")){ - this->LoadBinary(fname, fgroup, silent); return; - } - char bname[1024]; - sprintf(bname, "%s.buffer", fname); - if (!this->LoadBinary(bname, fgroup, silent)){ - this->LoadText(fname, fgroup, silent); - if (savebuffer) this->SaveBinary(bname, fgroup, silent); - } - } - private: - /*! \brief update num_feature info */ - inline void UpdateInfo(void){ - this->num_feature = 0; - for (size_t i = 0; i < data.NumRow(); i++){ - booster::FMatrixS::Line sp = data[i]; - for (unsigned j = 0; j < sp.len; j++){ - if (num_feature <= sp[j].findex){ - num_feature = sp[j].findex + 1; - } - } - } - } - }; - - - - } + if (!silent){ + printf("Index info of %d groups is loaded from %s as binary\n", + (int)(group_index.size() - 1), fgroup); + } + fclose(file_group); + }else{ + if(!silent){printf("The binary file of group info does not exist\n");} + } + } + + /*! 
+ * \brief cache load data given a file name, if filename ends with .buffer, direct load binary + * otherwise the function will first check if fname + '.buffer' exists, + * if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file, + * and try to create a buffer file + * \param fname name of binary data + * \param silent whether print information or not + * \param savebuffer whether do save binary buffer if it is text + */ + inline void CacheLoad(const char *fname, const char *fgroup, bool silent = false, bool savebuffer = true){ + int len = strlen(fname); + if (len > 8 && !strcmp(fname + len - 7, ".buffer")){ + this->LoadBinary(fname, fgroup, silent); return; + } + char bname[1024],bgroup[1024]; + sprintf(bname, "%s.buffer", fname); + sprintf(bgroup, "%s.buffer", fgroup); + if (!this->LoadBinary(bname, bgroup, silent)) + { + this->LoadText(fname, fgroup, silent); + if (savebuffer) this->SaveBinary(bname, bgroup, silent); + } + } + private: + /*! \brief update num_feature info */ + inline void UpdateInfo(void){ + this->num_feature = 0; + for (size_t i = 0; i < data.NumRow(); i++){ + booster::FMatrixS::Line sp = data[i]; + for (unsigned j = 0; j < sp.len; j++){ + if (num_feature <= sp[j].findex){ + num_feature = sp[j].findex + 1; + } + } + } + } + }; + } }; #endif \ No newline at end of file diff --git a/dev/base/xgboost_learner.h b/dev/base/xgboost_learner.h index 0b02030f8..0fc15e7f8 100644 --- a/dev/base/xgboost_learner.h +++ b/dev/base/xgboost_learner.h @@ -15,256 +15,264 @@ #include "../utils/xgboost_stream.h" namespace xgboost { - namespace base { - /*! \brief class for gradient boosting learner */ - class BoostLearner { - public: - /*! \brief constructor */ - BoostLearner(void) { - silent = 0; - } - /*! 
- * \brief booster associated with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - BoostLearner(const DMatrix *train, - const std::vector &evals, - const std::vector &evname) { - silent = 0; - this->SetData(train, evals, evname); - } + namespace base { + /*! \brief class for gradient boosting learner */ + class BoostLearner { + public: + /*! \brief constructor */ + BoostLearner(void) { + silent = 0; + } + /*! + * \brief booster associated with training and evaluating data + * \param train pointer to the training data + * \param evals array of evaluating data + * \param evname name of evaluation data, used print statistics + */ + BoostLearner(const DMatrix *train, + const std::vector &evals, + const std::vector &evname) { + silent = 0; + this->SetData(train, evals, evname); + } - /*! - * \brief associate booster with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - inline void SetData(const DMatrix *train, - const std::vector &evals, - const std::vector &evname) { - this->train_ = train; - this->evals_ = evals; - this->evname_ = evname; - // estimate feature bound - int num_feature = (int)(train->data.NumCol()); - // assign buffer index - unsigned buffer_size = static_cast(train->Size()); + /*! 
+ * \brief associate booster with training and evaluating data + * \param train pointer to the training data + * \param evals array of evaluating data + * \param evname name of evaluation data, used print statistics + */ + inline void SetData(const DMatrix *train, + const std::vector &evals, + const std::vector &evname) { + this->train_ = train; + this->evals_ = evals; + this->evname_ = evname; + // estimate feature bound + int num_feature = (int)(train->data.NumCol()); + // assign buffer index + unsigned buffer_size = static_cast(train->Size()); - for (size_t i = 0; i < evals.size(); ++i) { - buffer_size += static_cast(evals[i]->Size()); - num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol())); - } + for (size_t i = 0; i < evals.size(); ++i) { + buffer_size += static_cast(evals[i]->Size()); + num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol())); + } - char str_temp[25]; - if (num_feature > mparam.num_feature) { - mparam.num_feature = num_feature; - sprintf(str_temp, "%d", num_feature); - base_gbm.SetParam("bst:num_feature", str_temp); - } + char str_temp[25]; + if (num_feature > mparam.num_feature) { + mparam.num_feature = num_feature; + sprintf(str_temp, "%d", num_feature); + base_gbm.SetParam("bst:num_feature", str_temp); + } - sprintf(str_temp, "%u", buffer_size); - base_gbm.SetParam("num_pbuffer", str_temp); - if (!silent) { - printf("buffer_size=%u\n", buffer_size); - } + sprintf(str_temp, "%u", buffer_size); + base_gbm.SetParam("num_pbuffer", str_temp); + if (!silent) { + printf("buffer_size=%u\n", buffer_size); + } - // set eval_preds tmp sapce - this->eval_preds_.resize(evals.size(), std::vector()); - } - /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ - virtual inline void SetParam(const char *name, const char *val) { - if (!strcmp(name, "silent")) silent = atoi(val); - mparam.SetParam(name, val); - base_gbm.SetParam(name, val); - } - /*! 
- * \brief initialize solver before training, called before training - * this function is reserved for solver to allocate necessary space and do other preparation - */ - inline void InitTrainer(void) { - base_gbm.InitTrainer(); - } - /*! - * \brief initialize the current data storage for model, if the model is used first time, call this function - */ - inline void InitModel(void) { - base_gbm.InitModel(); - } - /*! - * \brief load model from stream - * \param fi input stream - */ - inline void LoadModel(utils::IStream &fi) { - base_gbm.LoadModel(fi); - utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0); - } - /*! - * \brief DumpModel - * \param fo text file - * \param fmap feature map that may help give interpretations of feature - * \param with_stats whether print statistics as well - */ - inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats) { - base_gbm.DumpModel(fo, fmap, with_stats); - } - /*! - * \brief Dump path of all trees - * \param fo text file - * \param data input data - */ - inline void DumpPath(FILE *fo, const DMatrix &data) { - base_gbm.DumpPath(fo, data.data); - } + // set eval_preds tmp sapce + this->eval_preds_.resize(evals.size(), std::vector()); + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + virtual inline void SetParam(const char *name, const char *val) { + if (!strcmp(name, "silent")) silent = atoi(val); + mparam.SetParam(name, val); + base_gbm.SetParam(name, val); + } + /*! + * \brief initialize solver before training, called before training + * this function is reserved for solver to allocate necessary space and do other preparation + */ + inline void InitTrainer(void) { + base_gbm.InitTrainer(); + } + /*! 
+ * \brief initialize the current data storage for model, if the model is used first time, call this function + */ + inline void InitModel(void) { + base_gbm.InitModel(); + if(!silent) printf("BoostLearner:InitModel Done!\n"); + } + /*! + * \brief load model from stream + * \param fi input stream + */ + inline void LoadModel(utils::IStream &fi) { + base_gbm.LoadModel(fi); + utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0); + } + /*! + * \brief DumpModel + * \param fo text file + * \param fmap feature map that may help give interpretations of feature + * \param with_stats whether print statistics as well + */ + inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats) { + base_gbm.DumpModel(fo, fmap, with_stats); + } + /*! + * \brief Dump path of all trees + * \param fo text file + * \param data input data + */ + inline void DumpPath(FILE *fo, const DMatrix &data) { + base_gbm.DumpPath(fo, data.data); + } - /*! - * \brief save model to stream - * \param fo output stream - */ - inline void SaveModel(utils::IStream &fo) const { - base_gbm.SaveModel(fo); - fo.Write(&mparam, sizeof(ModelParam)); - } + /*! + * \brief save model to stream + * \param fo output stream + */ + inline void SaveModel(utils::IStream &fo) const { + base_gbm.SaveModel(fo); + fo.Write(&mparam, sizeof(ModelParam)); + } - virtual void EvalOneIter(int iter, FILE *fo = stderr) {} + virtual void EvalOneIter(int iter, FILE *fo = stderr) {} - /*! - * \brief update the model for one iteration - * \param iteration iteration number - */ - inline void UpdateOneIter(int iter) { - this->PredictBuffer(preds_, *train_, 0); - this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_); - std::vector root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); - } + /*! 
+ * \brief update the model for one iteration + * \param iteration iteration number + */ + inline void UpdateOneIter(int iter) { + this->PredictBuffer(preds_, *train_, 0); + this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_); + std::vector root_index; + base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + +// printf("xgboost_learner.h:UpdateOneIter\n"); +// const unsigned ndata = static_cast(train_->Size()); +// #pragma omp parallel for schedule( static ) +// for (unsigned j = 0; j < ndata; ++j) { +// printf("haha:%d %f\n",j,base_gbm.Predict(train_->data, j, j)); +// } + } - /*! \brief get intransformed prediction, without buffering */ - inline void Predict(std::vector &preds, const DMatrix &data) { - preds.resize(data.Size()); + /*! \brief get intransformed prediction, without buffering */ + inline void Predict(std::vector &preds, const DMatrix &data) { + preds.resize(data.Size()); + const unsigned ndata = static_cast(data.Size()); + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j) { + preds[j] = base_gbm.Predict(data.data, j, -1); + + } + } - const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j) { - preds[j] = base_gbm.Predict(data.data, j, -1); - } - } + public: + /*! + * \brief update the model for one iteration + * \param iteration iteration number + */ + virtual inline void UpdateInteract(std::string action){ + this->InteractPredict(preds_, *train_, 0); - public: - /*! 
- * \brief update the model for one iteration - * \param iteration iteration number - */ - virtual inline void UpdateInteract(std::string action){ - this->InteractPredict(preds_, *train_, 0); + int buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i) { + std::vector &preds = this->eval_preds_[i]; + this->InteractPredict(preds, *evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); + } - int buffer_offset = static_cast(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i) { - std::vector &preds = this->eval_preds_[i]; - this->InteractPredict(preds, *evals_[i], buffer_offset); - buffer_offset += static_cast(evals_[i]->Size()); - } + if (action == "remove") { + base_gbm.DelteBooster(); + return; + } - if (action == "remove") { - base_gbm.DelteBooster(); - return; - } + this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_); + std::vector root_index; + base_gbm.DoBoost(grad_, hess_, train_->data, root_index); - this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_); - std::vector root_index; - base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + this->InteractRePredict(*train_, 0); + buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i) { + this->InteractRePredict(*evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); + } + }; - this->InteractRePredict(*train_, 0); - buffer_offset = static_cast(train_->Size()); - for (size_t i = 0; i < evals_.size(); ++i) { - this->InteractRePredict(*evals_[i], buffer_offset); - buffer_offset += static_cast(evals_[i]->Size()); - } - }; + protected: + /*! 
\brief get the intransformed predictions, given data */ + inline void InteractPredict(std::vector &preds, const DMatrix &data, unsigned buffer_offset) { + preds.resize(data.Size()); + const unsigned ndata = static_cast(data.Size()); + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j) { + preds[j] = base_gbm.InteractPredict(data.data, j, buffer_offset + j); + } + } + /*! \brief repredict trial */ + inline void InteractRePredict(const xgboost::base::DMatrix &data, unsigned buffer_offset) { + const unsigned ndata = static_cast(data.Size()); + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j) { + base_gbm.InteractRePredict(data.data, j, buffer_offset + j); + } + } - protected: - /*! \brief get the intransformed predictions, given data */ - inline void InteractPredict(std::vector &preds, const DMatrix &data, unsigned buffer_offset) { - preds.resize(data.Size()); - const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j) { - preds[j] = base_gbm.InteractPredict(data.data, j, buffer_offset + j); - } - } - /*! \brief repredict trial */ - inline void InteractRePredict(const xgboost::base::DMatrix &data, unsigned buffer_offset) { - const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j) { - base_gbm.InteractRePredict(data.data, j, buffer_offset + j); - } - } + /*! \brief get intransformed predictions, given data */ + virtual inline void PredictBuffer(std::vector &preds, const DMatrix &data, unsigned buffer_offset) { + preds.resize(data.Size()); + const unsigned ndata = static_cast(data.Size()); + + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j) { + preds[j] = base_gbm.Predict(data.data, j, buffer_offset + j); + } + } - /*! 
\brief get intransformed predictions, given data */ - virtual inline void PredictBuffer(std::vector &preds, const DMatrix &data, unsigned buffer_offset) { - preds.resize(data.Size()); - - const unsigned ndata = static_cast(data.Size()); -#pragma omp parallel for schedule( static ) - for (unsigned j = 0; j < ndata; ++j) { - preds[j] = base_gbm.Predict(data.data, j, buffer_offset + j); - } - } - - /*! \brief get the first order and second order gradient, given the transformed predictions and labels */ - virtual inline void GetGradient(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index, - std::vector &grad, - std::vector &hess) {}; + /*! \brief get the first order and second order gradient, given the transformed predictions and labels */ + virtual inline void GetGradient(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + std::vector &grad, + std::vector &hess) {}; - protected: + protected: - /*! \brief training parameter for regression */ - struct ModelParam { - /* \brief type of loss function */ - int loss_type; - /* \brief number of features */ - int num_feature; - /*! \brief reserved field */ - int reserved[16]; - /*! \brief constructor */ - ModelParam(void) { - loss_type = 0; - num_feature = 0; - memset(reserved, 0, sizeof(reserved)); - } - /*! - * \brief set parameters from outside - * \param name name of the parameter - * \param val value of the parameter - */ - inline void SetParam(const char *name, const char *val) { - if (!strcmp("loss_type", name)) loss_type = atoi(val); - if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); - } + /*! \brief training parameter for regression */ + struct ModelParam { + /* \brief type of loss function */ + int loss_type; + /* \brief number of features */ + int num_feature; + /*! \brief reserved field */ + int reserved[16]; + /*! 
\brief constructor */ + ModelParam(void) { + loss_type = 0; + num_feature = 0; + memset(reserved, 0, sizeof(reserved)); + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + inline void SetParam(const char *name, const char *val) { + if (!strcmp("loss_type", name)) loss_type = atoi(val); + if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); + } - }; + }; - int silent; - booster::GBMBase base_gbm; - ModelParam mparam; - const DMatrix *train_; - std::vector evals_; - std::vector evname_; - std::vector buffer_index_; - std::vector grad_, hess_, preds_; - std::vector< std::vector > eval_preds_; - }; - } + int silent; + booster::GBMBase base_gbm; + ModelParam mparam; + const DMatrix *train_; + std::vector evals_; + std::vector evname_; + std::vector buffer_index_; + std::vector grad_, hess_, preds_; + std::vector< std::vector > eval_preds_; + }; + } }; #endif diff --git a/dev/rank/xgboost_rank.h b/dev/rank/xgboost_rank.h index 82320f2a9..0758e9366 100644 --- a/dev/rank/xgboost_rank.h +++ b/dev/rank/xgboost_rank.h @@ -7,7 +7,7 @@ */ #include #include -#include +#include #include "xgboost_sample.h" #include "xgboost_rank_eval.h" #include "../base/xgboost_data_instance.h" @@ -18,133 +18,273 @@ #include "../base/xgboost_learner.h" namespace xgboost { - namespace rank { - /*! \brief class for gradient boosted regression */ - class RankBoostLearner :public base::BoostLearner{ - public: - /*! \brief constructor */ - RankBoostLearner(void) { - BoostLearner(); + namespace rank { + /*! \brief class for gradient boosted regression */ + class RankBoostLearner :public base::BoostLearner{ + public: + /*! \brief constructor */ + RankBoostLearner(void) { + BoostLearner(); + } + /*! 
+ * \brief a rank booster associated with training and evaluating data + * \param train pointer to the training data + * \param evals array of evaluating data + * \param evname name of evaluation data, used print statistics + */ + RankBoostLearner(const base::DMatrix *train, + const std::vector &evals, + const std::vector &evname) { + + BoostLearner(train, evals, evname); + } + + /*! + * \brief initialize solver before training, called before training + * this function is reserved for solver to allocate necessary space + * and do other preparation + */ + inline void InitTrainer(void) { + BoostLearner::InitTrainer(); + if (mparam.loss_type == PAIRWISE) { + evaluator_.AddEval("PAIR"); + } + else if (mparam.loss_type == MAP) { + evaluator_.AddEval("MAP"); + } + else { + evaluator_.AddEval("NDCG"); + } + evaluator_.Init(); + } + + void EvalOneIter(int iter, FILE *fo = stderr) { + fprintf(fo, "[%d]", iter); + int buffer_offset = static_cast(train_->Size()); + + for (size_t i = 0; i < evals_.size(); ++i) { + std::vector &preds = this->eval_preds_[i]; + this->PredictBuffer(preds, *evals_[i], buffer_offset); + evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels, (*evals_[i]).group_index); + buffer_offset += static_cast(evals_[i]->Size()); + } + fprintf(fo, "\n"); + } + + virtual inline void SetParam(const char *name, const char *val){ + BoostLearner::SetParam(name,val); + if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); + if (!strcmp(name, "rank:sampler")) sampler.AssignSampler(atoi(val)); + } + + private: + inline std::vector< Triple > GetSortedTuple(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + int group){ + std::vector< Triple > sorted_triple; + for(int j = group_index[group]; j < group_index[group+1]; j++){ + sorted_triple.push_back(Triple(preds[j],labels[j],j)); + } + std::sort(sorted_triple.begin(),sorted_triple.end(),Triplef1Comparer); + return sorted_triple; + } + + inline std::vector 
GetIndexMap(std::vector< Triple > sorted_triple,int start){ + std::vector index_remap; + index_remap.resize(sorted_triple.size()); + for(int i = 0; i < sorted_triple.size(); i++){ + index_remap[sorted_triple[i].f3_-start] = i; + } + return index_remap; + } + + inline float GetLambdaMAP(const std::vector< Triple > sorted_triple, + int index1,int index2, + std::vector< Quadruple > map_acc){ + if(index1 > index2) std::swap(index1,index2); + float original = map_acc[index2].f1_; + if(index1 != 0) original -= map_acc[index1 - 1].f1_; + float changed = 0; + if(sorted_triple[index1].f2_ < sorted_triple[index2].f2_){ + changed += map_acc[index2 - 1].f3_ - map_acc[index1].f3_; + changed += (map_acc[index1].f4_ + 1.0f)/(index1 + 1); + }else{ + changed += map_acc[index2 - 1].f2_ - map_acc[index1].f2_; + changed += map_acc[index2].f4_/(index2 + 1); + } + float ans = (changed - original)/(map_acc[map_acc.size() - 1].f4_); + if(ans < 0) ans = -ans; + return ans; + } + + inline float GetLambdaNDCG(const std::vector< Triple > sorted_triple, + int index1, + int index2,float IDCG){ + float original = pow(2,sorted_triple[index1].f2_)/log(index1+2) + + pow(2,sorted_triple[index2].f2_)/log(index2+2); + float changed = pow(2,sorted_triple[index2].f2_)/log(index1+2) + + pow(2,sorted_triple[index1].f2_)/log(index2+2); + float ans = (original - changed)/IDCG; + if(ans < 0) ans = -ans; + return ans; + } + + + inline float GetIDCG(const std::vector< Triple > sorted_triple){ + std::vector labels; + for(int i = 0; i < sorted_triple.size(); i++){ + labels.push_back(sorted_triple[i].f2_); + } + + std::sort(labels.begin(),labels.end(),std::greater()); + return EvalNDCG::DCG(labels); + } + + inline std::vector< Quadruple > GetMAPAcc(const std::vector< Triple > sorted_triple){ + std::vector< Quadruple > map_acc; + float hit = 0,acc1 = 0,acc2 = 0,acc3 = 0; + for(int i = 0; i < sorted_triple.size(); i++){ + if(sorted_triple[i].f2_ == 1) { + hit++; + acc1 += hit /( i + 1 ); + acc2 += (hit - 1)/(i+1); 
+ acc3 += (hit + 1)/(i+1); + } + map_acc.push_back(Quadruple(acc1,acc2,acc3,hit)); + } + return map_acc; + + } + + inline void GetGroupGradient(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + std::vector &grad, + std::vector &hess, + const std::vector< Triple > sorted_triple, + const std::vector index_remap, + const sample::Pairs& pairs, + int group){ + bool j_better; + float IDCG, pred_diff, pred_diff_exp, delta; + float first_order_gradient, second_order_gradient; + std::vector< Quadruple > map_acc; + + if(mparam.loss_type == NDCG){ + IDCG = GetIDCG(sorted_triple); + }else if(mparam.loss_type == MAP){ + map_acc = GetMAPAcc(sorted_triple); + } + + for (int j = group_index[group]; j < group_index[group + 1]; j++){ + std::vector pair_instance = pairs.GetPairs(j); + for (int k = 0; k < pair_instance.size(); k++){ + j_better = labels[j] > labels[pair_instance[k]]; + if (j_better){ + switch(mparam.loss_type){ + case PAIRWISE: delta = 1.0;break; + case MAP: delta = GetLambdaMAP(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],map_acc);break; + case NDCG: delta = GetLambdaNDCG(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],IDCG);break; + default: utils::Error("Cannot find the specified loss type"); + } + + pred_diff = preds[j] - preds[pair_instance[k]]; + pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff); + first_order_gradient = delta * FirstOrderGradient(pred_diff_exp); + second_order_gradient = 2 * delta * SecondOrderGradient(pred_diff_exp); + hess[j] += second_order_gradient; + grad[j] += first_order_gradient; + hess[pair_instance[k]] += second_order_gradient; + grad[pair_instance[k]] += -first_order_gradient; } - /*! 
- * \brief a rank booster associated with training and evaluating data - * \param train pointer to the training data - * \param evals array of evaluating data - * \param evname name of evaluation data, used print statistics - */ - RankBoostLearner(const base::DMatrix *train, - const std::vector &evals, - const std::vector &evname) { + } + } + } + public: + /*! \brief get the first order and second order gradient, given the + * intransformed predictions and labels */ + inline void GetGradient(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index, + std::vector &grad, + std::vector &hess) { + grad.resize(preds.size()); + hess.resize(preds.size()); + for (int i = 0; i < group_index.size() - 1; i++){ + sample::Pairs pairs = sampler.GenPairs(preds, labels, group_index[i], group_index[i + 1]); + //pairs.GetPairs() + std::vector< Triple > sorted_triple = GetSortedTuple(preds,labels,group_index,i); + std::vector index_remap = GetIndexMap(sorted_triple,group_index[i]); + GetGroupGradient(preds,labels,group_index, + grad,hess,sorted_triple,index_remap,pairs,i); + } + } - BoostLearner(train, evals, evname); - } + inline void UpdateInteract(std::string action) { + this->InteractPredict(preds_, *train_, 0); - /*! 
- * \brief initialize solver before training, called before training - * this function is reserved for solver to allocate necessary space - * and do other preparation - */ - inline void InitTrainer(void) { - BoostLearner::InitTrainer(); - if (mparam.loss_type == PAIRWISE) { - evaluator_.AddEval("PAIR"); - } - else if (mparam.loss_type == MAP) { - evaluator_.AddEval("MAP"); - } - else { - evaluator_.AddEval("NDCG"); - } - evaluator_.Init(); - } + int buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i){ + std::vector &preds = this->eval_preds_[i]; + this->InteractPredict(preds, *evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); + } - void EvalOneIter(int iter, FILE *fo = stderr) { - fprintf(fo, "[%d]", iter); - int buffer_offset = static_cast(train_->Size()); + if (action == "remove"){ + base_gbm.DelteBooster(); return; + } - for (size_t i = 0; i < evals_.size(); ++i) { - std::vector &preds = this->eval_preds_[i]; - this->PredictBuffer(preds, *evals_[i], buffer_offset); - evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels, (*evals_[i]).group_index); - buffer_offset += static_cast(evals_[i]->Size()); - } - fprintf(fo, "\n"); - } + this->GetGradient(preds_, train_->labels,train_->group_index, grad_, hess_); + std::vector root_index; + base_gbm.DoBoost(grad_, hess_, train_->data, root_index); - inline void SetParam(const char *name, const char *val){ - if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); - if (!strcmp(name, "rank:sampler")) sampler.AssignSampler(atoi(val)); - } - /*! 
\brief get the first order and second order gradient, given the transformed predictions and labels */ - inline void GetGradient(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index, - std::vector &grad, - std::vector &hess) { - grad.resize(preds.size()); - hess.resize(preds.size()); - bool j_better; - float pred_diff, pred_diff_exp, first_order_gradient, second_order_gradient; - for (int i = 0; i < group_index.size() - 1; i++){ - sample::Pairs pairs = sampler.GenPairs(preds, labels, group_index[i], group_index[i + 1]); - for (int j = group_index[i]; j < group_index[i + 1]; j++){ - std::vector pair_instance = pairs.GetPairs(j); - for (int k = 0; k < pair_instance.size(); k++){ - j_better = labels[j] > labels[pair_instance[k]]; - if (j_better){ - pred_diff = preds[preds[j] - pair_instance[k]]; - pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff); - first_order_gradient = FirstOrderGradient(pred_diff_exp); - second_order_gradient = 2 * SecondOrderGradient(pred_diff_exp); - hess[j] += second_order_gradient; - grad[j] += first_order_gradient; - hess[pair_instance[k]] += second_order_gradient; - grad[pair_instance[k]] += -first_order_gradient; - } - } - } - } - } - - inline void UpdateInteract(std::string action) { - - } - private: - enum LossType { - PAIRWISE = 0, - MAP = 1, - NDCG = 2 - }; + this->InteractRePredict(*train_, 0); + buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i){ + this->InteractRePredict(*evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); + } + } + + + + private: + enum LossType { + PAIRWISE = 0, + MAP = 1, + NDCG = 2 + }; - /*! 
- * \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), - * given the exponential of the difference of intransformed pair predictions - * \param the intransformed prediction of positive instance - * \param the intransformed prediction of negative instance - * \return first order gradient - */ - inline float FirstOrderGradient(float pred_diff_exp) const { - return -pred_diff_exp / (1 + pred_diff_exp); - } + /*! + * \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), + * given the exponential of the difference of intransformed pair predictions + * \param the intransformed prediction of positive instance + * \param the intransformed prediction of negative instance + * \return first order gradient + */ + inline float FirstOrderGradient(float pred_diff_exp) const { + return -pred_diff_exp / (1 + pred_diff_exp); + } - /*! - * \brief calculate second order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), - * given the exponential of the difference of intransformed pair predictions - * \param the intransformed prediction of positive instance - * \param the intransformed prediction of negative instance - * \return second order gradient - */ - inline float SecondOrderGradient(float pred_diff_exp) const { - return pred_diff_exp / pow(1 + pred_diff_exp, 2); - } + /*! 
+ * \brief calculate second order gradient of pairwise loss function(f(x) = ln(1+exp(-x)), + * given the exponential of the difference of intransformed pair predictions + * \param the intransformed prediction of positive instance + * \param the intransformed prediction of negative instance + * \return second order gradient + */ + inline float SecondOrderGradient(float pred_diff_exp) const { + return pred_diff_exp / pow(1 + pred_diff_exp, 2); + } - private: - RankEvalSet evaluator_; - sample::PairSamplerWrapper sampler; - }; - }; + private: + RankEvalSet evaluator_; + sample::PairSamplerWrapper sampler; + }; + }; }; #endif diff --git a/dev/rank/xgboost_rank_eval.h b/dev/rank/xgboost_rank_eval.h index 73a7664ca..f03d3bf8f 100644 --- a/dev/rank/xgboost_rank_eval.h +++ b/dev/rank/xgboost_rank_eval.h @@ -13,170 +13,225 @@ #include "../utils/xgboost_omp.h" namespace xgboost { - namespace rank { - /*! \brief evaluator that evaluates the loss metrics */ - class IRankEvaluator { - public: - /*! - * \brief evaluate a specific metric - * \param preds prediction - * \param labels label - */ - virtual float Eval(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index) const = 0; - /*! \return name of metric */ - virtual const char *Name(void) const = 0; - }; + namespace rank { + /*! \brief evaluator that evaluates the loss metrics */ + class IRankEvaluator { + public: + /*! + * \brief evaluate a specific metric + * \param preds prediction + * \param labels label + */ + virtual float Eval(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index) const = 0; + /*! 
\return name of metric */ + virtual const char *Name(void) const = 0; + }; - class Pair{ - public: - float key_; - float value_; + class Pair{ + public: + float key_; + float value_; - Pair(float key, float value){ - key_ = key; - value_ = value_; - } - }; + Pair(float key, float value):key_(key),value_(value){ + } + }; - bool PairKeyComparer(const Pair &a, const Pair &b){ - return a.key_ < b.key_; - } + bool PairKeyComparer(const Pair &a, const Pair &b){ + return a.key_ < b.key_; + } - bool PairValueComparer(const Pair &a, const Pair &b){ - return a.value_ < b.value_; - } - - - /*! \brief Mean Average Precision */ - class EvalMAP : public IRankEvaluator { - public: - float Eval(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index) const { - float acc = 0; - std::vector pairs_sort; - for (int i = 0; i < group_index.size() - 1; i++){ - for (int j = group_index[i]; j < group_index[i + 1]; j++){ - Pair pair(preds[j], labels[j]); - pairs_sort.push_back(pair); - } - acc += average_precision(pairs_sort); - } - return acc / (group_index.size() - 1); - } - - - virtual const char *Name(void) const { - return "MAP"; - } - - float average_precision(std::vector pairs_sort) const{ - - std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer); - float hits = 0; - float average_precision = 0; - for (int j = 0; j < pairs_sort.size(); j++){ - if (pairs_sort[j].value_ == 1){ - hits++; - average_precision += hits / (j + 1); - } - } - if (hits != 0) average_precision /= hits; - return average_precision; - } - }; - - - class EvalPair : public IRankEvaluator{ - public: - float Eval(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index) const { - return 0; - } - - const char *Name(void) const { - return "PAIR"; - } - }; - - /*! 
\brief Normalized DCG */ - class EvalNDCG : public IRankEvaluator { - public: - float Eval(const std::vector &preds, - const std::vector &labels, - const std::vector &group_index) const { - if (group_index.size() <= 1) return 0; - float acc = 0; - std::vector pairs_sort; - for (int i = 0; i < group_index.size() - 1; i++){ - for (int j = group_index[i]; j < group_index[i + 1]; j++){ - Pair pair(preds[j], labels[j]); - pairs_sort.push_back(pair); - } - acc += NDCG(pairs_sort); - } - return acc / (group_index.size() - 1); - } - - float NDCG(std::vector pairs_sort) const{ - std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer); - float dcg = DCG(pairs_sort); - std::sort(pairs_sort.begin(), pairs_sort.end(), PairValueComparer); - float IDCG = DCG(pairs_sort); - if (IDCG == 0) return 0; - return dcg / IDCG; - } - - float DCG(std::vector pairs_sort) const{ - float ans = 0.0; - ans += pairs_sort[0].value_; - for (int i = 1; i < pairs_sort.size(); i++){ - ans += pairs_sort[i].value_ / log(i + 1); - } - return ans; - } - - virtual const char *Name(void) const { - return "NDCG"; - } - }; + bool PairValueComparer(const Pair &a, const Pair &b){ + return a.value_ < b.value_; + } + template + class Triple{ + public: + T1 f1_; + T2 f2_; + T3 f3_; + Triple(T1 f1,T2 f2,T3 f3):f1_(f1),f2_(f2),f3_(f3){ + + } }; - - namespace rank { - /*! 
\brief a set of evaluators */ - class RankEvalSet { - public: - inline void AddEval(const char *name) { - if (!strcmp(name, "PAIR")) evals_.push_back(&pair_); - if (!strcmp(name, "MAP")) evals_.push_back(&map_); - if (!strcmp(name, "NDCG")) evals_.push_back(&ndcg_); - } - - inline void Init(void) { - std::sort(evals_.begin(), evals_.end()); - evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin()); - } - - inline void Eval(FILE *fo, const char *evname, - const std::vector &preds, - const std::vector &labels, - const std::vector &group_index) const { - for (size_t i = 0; i < evals_.size(); ++i) { - float res = evals_[i]->Eval(preds, labels, group_index); - fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res); - } - } - - private: - EvalPair pair_; - EvalMAP map_; - EvalNDCG ndcg_; - std::vector evals_; - }; + + template + class Quadruple{ + public: + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + Quadruple(T1 f1,T2 f2,T3 f3,T4 f4):f1_(f1),f2_(f2),f3_(f3),f4_(f4){ + + } }; + + bool Triplef1Comparer(const Triple &a, const Triple &b){ + return a.f1_< b.f1_; + } + + /*! 
+ \brief Mean Average Precision */ + class EvalMAP : public IRankEvaluator { + public: + float Eval(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index) const { + if (group_index.size() <= 1) return 0; + float acc = 0; + std::vector pairs_sort; + for (int i = 0; i < group_index.size() - 1; i++){ + pairs_sort.clear(); for (int j = group_index[i]; j < group_index[i + 1]; j++){ + Pair pair(preds[j], labels[j]); + pairs_sort.push_back(pair); + } + acc += average_precision(pairs_sort); + } + return acc / (group_index.size() - 1); + } + + + + virtual const char *Name(void) const { + return "MAP"; + } + private: + float average_precision(std::vector pairs_sort) const{ + + std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer); + float hits = 0; + float average_precision = 0; + for (int j = 0; j < pairs_sort.size(); j++){ + if (pairs_sort[j].value_ == 1){ + hits++; + average_precision += hits / (j + 1); + } + } + if (hits != 0) average_precision /= hits; + return average_precision; + } + }; + + + class EvalPair : public IRankEvaluator{ + public: + float Eval(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index) const { + if (group_index.size() <= 1) return 0; + float acc = 0; + for (int i = 0; i < group_index.size() - 1; i++){ + acc += Count_Inversion(preds,labels, + group_index[i],group_index[i+1]); + } + return acc / (group_index.size() - 1); + } + + const char *Name(void) const { + return "PAIR"; + } + private: + float Count_Inversion(const std::vector &preds, + const std::vector &labels,int begin,int end + ) const{ + float ans = 0; + for(int i = begin; i < end; i++){ + for(int j = i + 1; j < end; j++){ + if(preds[i] > preds[j] && labels[i] < labels[j]) + ans++; + } + } + return ans; + } + }; + + /*! 
+ \brief Normalized DCG */ + class EvalNDCG : public IRankEvaluator { + public: + float Eval(const std::vector &preds, + const std::vector &labels, + const std::vector &group_index) const { + if (group_index.size() <= 1) return 0; + float acc = 0; + std::vector pairs_sort; + for (int i = 0; i < group_index.size() - 1; i++){ + pairs_sort.clear(); for (int j = group_index[i]; j < group_index[i + 1]; j++){ + Pair pair(preds[j], labels[j]); + pairs_sort.push_back(pair); + } + acc += NDCG(pairs_sort); + } + return acc / (group_index.size() - 1); + } + + static float DCG(const std::vector &labels){ + float ans = 0.0; + for (int i = 0; i < labels.size(); i++){ + ans += (pow(2,labels[i]) - 1 ) / log(i + 2); + } + return ans; + } + + virtual const char *Name(void) const { + return "NDCG"; + } + + private: + float NDCG(std::vector pairs_sort) const{ + std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer); + float dcg = DCG(pairs_sort); + std::sort(pairs_sort.begin(), pairs_sort.end(), PairValueComparer); + float IDCG = DCG(pairs_sort); + if (IDCG == 0) return 0; + return dcg / IDCG; + } + + float DCG(std::vector pairs_sort) const{ + std::vector labels; + for (int i = 0; i < pairs_sort.size(); i++){ + labels.push_back(pairs_sort[i].value_); + } + return DCG(labels); + } + + + }; + + }; + + namespace rank { + /*! 
\brief a set of evaluators */ + class RankEvalSet { + public: + inline void AddEval(const char *name) { + if (!strcmp(name, "PAIR")) evals_.push_back(&pair_); + if (!strcmp(name, "MAP")) evals_.push_back(&map_); + if (!strcmp(name, "NDCG")) evals_.push_back(&ndcg_); + } + + inline void Init(void) { + std::sort(evals_.begin(), evals_.end()); + evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin()); + } + + inline void Eval(FILE *fo, const char *evname, + const std::vector &preds, + const std::vector &labels, + const std::vector &group_index) const { + for (size_t i = 0; i < evals_.size(); ++i) { + float res = evals_[i]->Eval(preds, labels, group_index); + fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res); + } + } + + private: + EvalPair pair_; + EvalMAP map_; + EvalNDCG ndcg_; + std::vector evals_; + }; + }; }; #endif diff --git a/dev/rank/xgboost_rank_main.cpp b/dev/rank/xgboost_rank_main.cpp index 56f02f395..2ad6d98a4 100644 --- a/dev/rank/xgboost_rank_main.cpp +++ b/dev/rank/xgboost_rank_main.cpp @@ -11,20 +11,12 @@ #include "../base/xgboost_boost_task.h" #include "xgboost_rank.h" #include "../regression/xgboost_reg.h" +#include "../regression/xgboost_reg_main.cpp" +#include "../base/xgboost_data_instance.h" -int main(int argc, char *argv[]) { - - xgboost::random::Seed(0); - xgboost::base::BoostTask tsk; - xgboost::utils::ConfigIterator itr(argv[1]); -/* int learner_index = 0; - while (itr.Next()){ - if (!strcmp(itr.name(), "learning_task")){ - learner_index = atoi(itr.val()); - } - }*/ - xgboost::rank::RankBoostLearner* rank_learner = new xgboost::rank::RankBoostLearner; - xgboost::base::BoostLearner *parent = static_cast(rank_learner); - tsk.SetLearner(parent); - return tsk.Run(argc, argv); +int main(int argc, char *argv[]) { + xgboost::random::Seed(0); + xgboost::base::BoostTask rank_tsk; + rank_tsk.SetLearner(new xgboost::rank::RankBoostLearner); + return rank_tsk.Run(argc, argv); } diff --git a/dev/rank/xgboost_sample.h 
b/dev/rank/xgboost_sample.h index 85f429d56..6719390a8 100644 --- a/dev/rank/xgboost_sample.h +++ b/dev/rank/xgboost_sample.h @@ -5,123 +5,124 @@ #include"../utils/xgboost_utils.h" namespace xgboost { - namespace rank { - namespace sample { + namespace rank { + namespace sample { - /* - * \brief the data structure to maintain the sample pairs - */ - struct Pairs { + /* + * \brief the data structure to maintain the sample pairs + */ + struct Pairs { - /* - * \brief constructor given the start and end offset of the sampling group - * in overall instances - * \param start the begin index of the group - * \param end the end index of the group - */ - Pairs(int start,int end):start_(start),end_(end_){ - for(int i = start; i < end; i++){ - std::vector v; - pairs_.push_back(v); - } - } - /* - * \brief retrieve the related pair information of an data instances - * \param index, the index of retrieved instance - * \return the index of instances paired - */ - std::vector GetPairs(int index) { - utils::Assert(index >= start_ && index < end_,"The query index out of sampling bound"); - return pairs_[index-start_]; - } + /* + * \brief constructor given the start and end offset of the sampling group + * in overall instances + * \param start the begin index of the group + * \param end the end index of the group + */ + Pairs(int start, int end) :start_(start), end_(end){ + for (int i = start; i < end; i++){ + std::vector v; + pairs_.push_back(v); + } + } + /* + * \brief retrieve the related pair information of an data instances + * \param index, the index of retrieved instance + * \return the index of instances paired + */ + std::vector GetPairs(int index) const{ + utils::Assert(index >= start_ && index < end_, "The query index out of sampling bound"); + return pairs_[index - start_]; + } - /* - * \brief add in a sampled pair - * \param index the index of the instance to sample a friend - * \param paired_index the index of the instance sampled as a friend - */ - void push(int 
index,int paired_index){ - pairs_[index - start_].push_back(paired_index); - } - - std::vector< std::vector > pairs_; - int start_; - int end_; - }; + /* + * \brief add in a sampled pair + * \param index the index of the instance to sample a friend + * \param paired_index the index of the instance sampled as a friend + */ + void push(int index, int paired_index){ + pairs_[index - start_].push_back(paired_index); + } - /* - * \brief the interface of pair sampler - */ - struct IPairSampler { - /* - * \brief Generate sample pairs given the predcions, labels, the start and the end index - * of a specified group - * \param preds, the predictions of all data instances - * \param labels, the labels of all data instances - * \param start, the start index of a specified group - * \param end, the end index of a specified group - * \return the generated pairs - */ - virtual Pairs GenPairs(const std::vector &preds, - const std::vector &labels, - int start,int end) = 0; - - }; - - enum{ - BINARY_LINEAR_SAMPLER - }; - - /*! \brief A simple pair sampler when the rank relevence scale is binary - * for each positive instance, we will pick a negative - * instance and add in a pair. When using binary linear sampler, - * we should guarantee the labels are 0 or 1 - */ - struct BinaryLinearSampler:public IPairSampler{ - virtual Pairs GenPairs(const std::vector &preds, - const std::vector &labels, - int start,int end) { - Pairs pairs(start,end); - int pointer = 0, last_pointer = 0,index = start, interval = end - start; - for(int i = start; i < end; i++){ - if(labels[i] == 1){ - while(true){ - index = (++pointer) % interval + start; - if(labels[index] == 0) break; - if(pointer - last_pointer > interval) return pairs; - } - pairs.push(i,index); - pairs.push(index,i); - last_pointer = pointer; - } - } - return pairs; - } - }; - - - /*! 
\brief Pair Sampler Wrapper*/ - struct PairSamplerWrapper{ - public: - inline void AssignSampler( int sampler_index ){ - - switch(sampler_index){ - case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler;break; - - default:utils::Error("Cannot find the specified sampler"); - } - } - - Pairs GenPairs(const std::vector &preds, - const std::vector &labels, - int start,int end){ - return sampler_->GenPairs(preds,labels,start,end); - } - private: - BinaryLinearSampler binary_linear_sampler; - IPairSampler *sampler_; - }; + std::vector< std::vector > pairs_; + int start_; + int end_; + }; + + /* + * \brief the interface of pair sampler + */ + struct IPairSampler { + /* + * \brief Generate sample pairs given the predcions, labels, the start and the end index + * of a specified group + * \param preds, the predictions of all data instances + * \param labels, the labels of all data instances + * \param start, the start index of a specified group + * \param end, the end index of a specified group + * \return the generated pairs + */ + virtual Pairs GenPairs(const std::vector &preds, + const std::vector &labels, + int start, int end) = 0; + + }; + + enum{ + BINARY_LINEAR_SAMPLER + }; + + /*! \brief A simple pair sampler when the rank relevence scale is binary + * for each positive instance, we will pick a negative + * instance and add in a pair. 
When using binary linear sampler, + * we should guarantee the labels are 0 or 1 + */ + struct BinaryLinearSampler :public IPairSampler{ + virtual Pairs GenPairs(const std::vector &preds, + const std::vector &labels, + int start, int end) { + Pairs pairs(start, end); + int pointer = 0, last_pointer = 0, index = start, interval = end - start; + for (int i = start; i < end; i++){ + if (labels[i] == 1){ + while (true){ + index = (++pointer) % interval + start; + if (labels[index] == 0) break; + if (pointer - last_pointer > interval) return pairs; + } + pairs.push(i, index); + pairs.push(index, i); + last_pointer = pointer; + } + } + return pairs; + } + }; + + + /*! \brief Pair Sampler Wrapper*/ + struct PairSamplerWrapper{ + public: + inline void AssignSampler(int sampler_index){ + + switch (sampler_index){ + case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler; break; + + default:utils::Error("Cannot find the specified sampler"); + } + } + + Pairs GenPairs(const std::vector &preds, + const std::vector &labels, + int start, int end){ + utils::Assert(sampler_ != NULL,"Not config the sampler yet. Add rank:sampler in the config file\n"); + return sampler_->GenPairs(preds, labels, start, end); + } + private: + BinaryLinearSampler binary_linear_sampler; + IPairSampler *sampler_; + }; + } } - } } #endif \ No newline at end of file diff --git a/regression/xgboost_reg.h b/regression/xgboost_reg.h index 2abe436b7..01cf0d2f3 100644 --- a/regression/xgboost_reg.h +++ b/regression/xgboost_reg.h @@ -21,239 +21,240 @@ namespace xgboost{ class RegBoostLearner{ public: /*! \brief constructor */ - RegBoostLearner( void ){ - silent = 0; + RegBoostLearner(void){ + silent = 0; } - /*! - * \brief a regression booter associated with training and evaluating data + /*! 
+ * \brief a regression booter associated with training and evaluating data * \param train pointer to the training data * \param evals array of evaluating data * \param evname name of evaluation data, used print statistics */ - RegBoostLearner( const DMatrix *train, - const std::vector &evals, - const std::vector &evname ){ + RegBoostLearner(const DMatrix *train, + const std::vector &evals, + const std::vector &evname){ silent = 0; - this->SetData(train,evals,evname); + this->SetData(train, evals, evname); } - /*! - * \brief associate regression booster with training and evaluating data + /*! + * \brief associate regression booster with training and evaluating data * \param train pointer to the training data * \param evals array of evaluating data * \param evname name of evaluation data, used print statistics */ - inline void SetData( const DMatrix *train, - const std::vector &evals, - const std::vector &evname ){ + inline void SetData(const DMatrix *train, + const std::vector &evals, + const std::vector &evname){ this->train_ = train; this->evals_ = evals; - this->evname_ = evname; + this->evname_ = evname; // estimate feature bound int num_feature = (int)(train->data.NumCol()); // assign buffer index - unsigned buffer_size = static_cast( train->Size() ); - - for( size_t i = 0; i < evals.size(); ++ i ){ - buffer_size += static_cast( evals[i]->Size() ); - num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) ); + unsigned buffer_size = static_cast(train->Size()); + + for (size_t i = 0; i < evals.size(); ++i){ + buffer_size += static_cast(evals[i]->Size()); + num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol())); } char str_temp[25]; - if( num_feature > mparam.num_feature ){ + if (num_feature > mparam.num_feature){ mparam.num_feature = num_feature; - sprintf( str_temp, "%d", num_feature ); - base_gbm.SetParam( "bst:num_feature", str_temp ); + sprintf(str_temp, "%d", num_feature); + base_gbm.SetParam("bst:num_feature", str_temp); } - - 
sprintf( str_temp, "%u", buffer_size ); - base_gbm.SetParam( "num_pbuffer", str_temp ); - if( !silent ){ - printf( "buffer_size=%u\n", buffer_size ); + + sprintf(str_temp, "%u", buffer_size); + base_gbm.SetParam("num_pbuffer", str_temp); + if (!silent){ + printf("buffer_size=%u\n", buffer_size); } - + // set eval_preds tmp sapce - this->eval_preds_.resize( evals.size(), std::vector() ); + this->eval_preds_.resize(evals.size(), std::vector()); } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - inline void SetParam( const char *name, const char *val ){ - if( !strcmp( name, "silent") ) silent = atoi( val ); - if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val ); - mparam.SetParam( name, val ); - base_gbm.SetParam( name, val ); + inline void SetParam(const char *name, const char *val){ + if (!strcmp(name, "silent")) silent = atoi(val); + if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); + mparam.SetParam(name, val); + base_gbm.SetParam(name, val); } /*! * \brief initialize solver before training, called before training - * this function is reserved for solver to allocate necessary space and do other preparation + * this function is reserved for solver to allocate necessary space and do other preparation */ - inline void InitTrainer( void ){ + inline void InitTrainer(void){ base_gbm.InitTrainer(); - if( mparam.loss_type == kLogisticClassify ){ - evaluator_.AddEval( "error" ); - }else{ - evaluator_.AddEval( "rmse" ); + if (mparam.loss_type == kLogisticClassify){ + evaluator_.AddEval("error"); + } + else{ + evaluator_.AddEval("rmse"); } evaluator_.Init(); - } + } /*! * \brief initialize the current data storage for model, if the model is used first time, call this function */ - inline void InitModel( void ){ + inline void InitModel(void){ base_gbm.InitModel(); mparam.AdjustBase(); } - /*! + /*! 
* \brief load model from stream * \param fi input stream - */ - inline void LoadModel( utils::IStream &fi ){ - base_gbm.LoadModel( fi ); - utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 ); + */ + inline void LoadModel(utils::IStream &fi){ + base_gbm.LoadModel(fi); + utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0); } - /*! + /*! * \brief DumpModel - * \param fo text file - * \param fmap feature map that may help give interpretations of feature - * \param with_stats whether print statistics as well - */ - inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){ - base_gbm.DumpModel( fo, fmap, with_stats ); + * \param fo text file + * \param fmap feature map that may help give interpretations of feature + * \param with_stats whether print statistics as well + */ + inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){ + base_gbm.DumpModel(fo, fmap, with_stats); } - /*! + /*! * \brief Dump path of all trees - * \param fo text file + * \param fo text file * \param data input data */ - inline void DumpPath( FILE *fo, const DMatrix &data ){ - base_gbm.DumpPath( fo, data.data ); + inline void DumpPath(FILE *fo, const DMatrix &data){ + base_gbm.DumpPath(fo, data.data); } - /*! + /*! * \brief save model to stream * \param fo output stream */ - inline void SaveModel( utils::IStream &fo ) const{ - base_gbm.SaveModel( fo ); - fo.Write( &mparam, sizeof(ModelParam) ); - } - /*! + inline void SaveModel(utils::IStream &fo) const{ + base_gbm.SaveModel(fo); + fo.Write(&mparam, sizeof(ModelParam)); + } + /*! 
* \brief update the model for one iteration * \param iteration iteration number */ - inline void UpdateOneIter( int iter ){ - this->PredictBuffer( preds_, *train_, 0 ); - this->GetGradient( preds_, train_->labels, grad_, hess_ ); + inline void UpdateOneIter(int iter){ + this->PredictBuffer(preds_, *train_, 0); + this->GetGradient(preds_, train_->labels, grad_, hess_); std::vector root_index; - base_gbm.DoBoost( grad_, hess_, train_->data, root_index ); + base_gbm.DoBoost(grad_, hess_, train_->data, root_index); } - /*! + /*! * \brief evaluate the model for specific iteration * \param iter iteration number * \param fo file to output log - */ - inline void EvalOneIter( int iter, FILE *fo = stderr ){ - fprintf( fo, "[%d]", iter ); - int buffer_offset = static_cast( train_->Size() ); - - for( size_t i = 0; i < evals_.size(); ++i ){ - std::vector &preds = this->eval_preds_[ i ]; - this->PredictBuffer( preds, *evals_[i], buffer_offset); - evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels ); - buffer_offset += static_cast( evals_[i]->Size() ); + */ + inline void EvalOneIter(int iter, FILE *fo = stderr){ + fprintf(fo, "[%d]", iter); + int buffer_offset = static_cast(train_->Size()); + + for (size_t i = 0; i < evals_.size(); ++i){ + std::vector &preds = this->eval_preds_[i]; + this->PredictBuffer(preds, *evals_[i], buffer_offset); + evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels); + buffer_offset += static_cast(evals_[i]->Size()); } - fprintf( fo,"\n" ); + fprintf(fo, "\n"); } /*! 
\brief get prediction, without buffering */ - inline void Predict( std::vector &preds, const DMatrix &data ){ - preds.resize( data.Size() ); + inline void Predict(std::vector &preds, const DMatrix &data){ + preds.resize(data.Size()); - const unsigned ndata = static_cast( data.Size() ); - #pragma omp parallel for schedule( static ) - for( unsigned j = 0; j < ndata; ++ j ){ + const unsigned ndata = static_cast(data.Size()); +#pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform - ( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) ); + (mparam.base_score + base_gbm.Predict(data.data, j, -1)); } } public: - /*! + /*! * \brief update the model for one iteration * \param iteration iteration number */ - inline void UpdateInteract( std::string action ){ - this->InteractPredict( preds_, *train_, 0 ); + inline void UpdateInteract(std::string action){ + this->InteractPredict(preds_, *train_, 0); - int buffer_offset = static_cast( train_->Size() ); - for( size_t i = 0; i < evals_.size(); ++i ){ - std::vector &preds = this->eval_preds_[ i ]; - this->InteractPredict( preds, *evals_[i], buffer_offset ); - buffer_offset += static_cast( evals_[i]->Size() ); + int buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i){ + std::vector &preds = this->eval_preds_[i]; + this->InteractPredict(preds, *evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); } - if( action == "remove" ){ + if (action == "remove"){ base_gbm.DelteBooster(); return; } - - this->GetGradient( preds_, train_->labels, grad_, hess_ ); - std::vector root_index; - base_gbm.DoBoost( grad_, hess_, train_->data, root_index ); - this->InteractRePredict( *train_, 0 ); - buffer_offset = static_cast( train_->Size() ); - for( size_t i = 0; i < evals_.size(); ++i ){ - this->InteractRePredict( *evals_[i], buffer_offset ); - buffer_offset += static_cast( evals_[i]->Size() ); + this->GetGradient(preds_, 
train_->labels, grad_, hess_); + std::vector root_index; + base_gbm.DoBoost(grad_, hess_, train_->data, root_index); + + this->InteractRePredict(*train_, 0); + buffer_offset = static_cast(train_->Size()); + for (size_t i = 0; i < evals_.size(); ++i){ + this->InteractRePredict(*evals_[i], buffer_offset); + buffer_offset += static_cast(evals_[i]->Size()); } } private: /*! \brief get the transformed predictions, given data */ - inline void InteractPredict( std::vector &preds, const DMatrix &data, unsigned buffer_offset ){ - preds.resize( data.Size() ); - const unsigned ndata = static_cast( data.Size() ); - #pragma omp parallel for schedule( static ) - for( unsigned j = 0; j < ndata; ++ j ){ + inline void InteractPredict(std::vector &preds, const DMatrix &data, unsigned buffer_offset){ + preds.resize(data.Size()); + const unsigned ndata = static_cast(data.Size()); + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform - ( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) ); + (mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j)); } } /*! \brief repredict trial */ - inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){ - const unsigned ndata = static_cast( data.Size() ); - #pragma omp parallel for schedule( static ) - for( unsigned j = 0; j < ndata; ++ j ){ - base_gbm.InteractRePredict( data.data, j, buffer_offset + j ); + inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){ + const unsigned ndata = static_cast(data.Size()); + #pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + base_gbm.InteractRePredict(data.data, j, buffer_offset + j); } } private: /*! 
\brief get the transformed predictions, given data */ - inline void PredictBuffer( std::vector &preds, const DMatrix &data, unsigned buffer_offset ){ - preds.resize( data.Size() ); + inline void PredictBuffer(std::vector &preds, const DMatrix &data, unsigned buffer_offset){ + preds.resize(data.Size()); - const unsigned ndata = static_cast( data.Size() ); - #pragma omp parallel for schedule( static ) - for( unsigned j = 0; j < ndata; ++ j ){ + const unsigned ndata = static_cast(data.Size()); +#pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ preds[j] = mparam.PredTransform - ( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) ); + (mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j)); } } /*! \brief get the first order and second order gradient, given the transformed predictions and labels */ - inline void GetGradient( const std::vector &preds, - const std::vector &labels, - std::vector &grad, - std::vector &hess ){ - grad.resize( preds.size() ); hess.resize( preds.size() ); + inline void GetGradient(const std::vector &preds, + const std::vector &labels, + std::vector &grad, + std::vector &hess){ + grad.resize(preds.size()); hess.resize(preds.size()); - const unsigned ndata = static_cast( preds.size() ); - #pragma omp parallel for schedule( static ) - for( unsigned j = 0; j < ndata; ++ j ){ - grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] ); - hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] ); + const unsigned ndata = static_cast(preds.size()); +#pragma omp parallel for schedule( static ) + for (unsigned j = 0; j < ndata; ++j){ + grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]); + hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]); } } - + private: enum LossType{ kLinearSquare = 0, @@ -270,73 +271,73 @@ namespace xgboost{ /* \brief number of features */ int num_feature; /*! \brief reserved field */ - int reserved[ 16 ]; + int reserved[16]; /*! 
\brief constructor */ - ModelParam( void ){ + ModelParam(void){ base_score = 0.5f; - loss_type = 0; + loss_type = 0; num_feature = 0; - memset( reserved, 0, sizeof( reserved ) ); + memset(reserved, 0, sizeof(reserved)); } - /*! - * \brief set parameters from outside + /*! + * \brief set parameters from outside * \param name name of the parameter * \param val value of the parameter */ - inline void SetParam( const char *name, const char *val ){ - if( !strcmp("base_score", name ) ) base_score = (float)atof( val ); - if( !strcmp("loss_type", name ) ) loss_type = atoi( val ); - if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); + inline void SetParam(const char *name, const char *val){ + if (!strcmp("base_score", name)) base_score = (float)atof(val); + if (!strcmp("loss_type", name)) loss_type = atoi(val); + if (!strcmp("bst:num_feature", name)) num_feature = atoi(val); } - /*! + /*! * \brief adjust base_score - */ - inline void AdjustBase( void ){ - if( loss_type == 1 || loss_type == 2 ){ - utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" ); - base_score = - logf( 1.0f / base_score - 1.0f ); + */ + inline void AdjustBase(void){ + if (loss_type == 1 || loss_type == 2){ + utils::Assert(base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain"); + base_score = -logf(1.0f / base_score - 1.0f); } } - /*! - * \brief transform the linear sum to prediction + /*! + * \brief transform the linear sum to prediction * \param x linear sum of boosting ensemble * \return transformed prediction */ - inline float PredTransform( float x ){ - switch( loss_type ){ + inline float PredTransform(float x){ + switch (loss_type){ case kLinearSquare: return x; case kLogisticClassify: - case kLogisticNeglik: return 1.0f/(1.0f + expf(-x)); + case kLogisticNeglik: return 1.0f / (1.0f + expf(-x)); default: utils::Error("unknown loss_type"); return 0.0f; } } - /*! + /*! 
* \brief calculate first order gradient of loss, given transformed prediction * \param predt transformed prediction * \param label true label * \return first order gradient */ - inline float FirstOrderGradient( float predt, float label ) const{ - switch( loss_type ){ + inline float FirstOrderGradient(float predt, float label) const{ + switch (loss_type){ case kLinearSquare: return predt - label; case kLogisticClassify: case kLogisticNeglik: return predt - label; default: utils::Error("unknown loss_type"); return 0.0f; } } - /*! + /*! * \brief calculate second order gradient of loss, given transformed prediction * \param predt transformed prediction * \param label true label * \return second order gradient */ - inline float SecondOrderGradient( float predt, float label ) const{ - switch( loss_type ){ + inline float SecondOrderGradient(float predt, float label) const{ + switch (loss_type){ case kLinearSquare: return 1.0f; case kLogisticClassify: - case kLogisticNeglik: return predt * ( 1 - predt ); + case kLogisticNeglik: return predt * (1 - predt); default: utils::Error("unknown loss_type"); return 0.0f; } } @@ -348,10 +349,10 @@ namespace xgboost{ * \return the specified loss */ inline float Loss(const std::vector &preds, const std::vector &labels) const{ - switch( loss_type ){ - case kLinearSquare: return SquareLoss(preds,labels); - case kLogisticNeglik: - case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels); + switch (loss_type){ + case kLinearSquare: return SquareLoss(preds, labels); + case kLogisticNeglik: + case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels); default: utils::Error("unknown loss_type"); return 0.0f; } } @@ -364,7 +365,7 @@ namespace xgboost{ */ inline float SquareLoss(const std::vector &preds, const std::vector &labels) const{ float ans = 0.0; - for(size_t i = 0; i < preds.size(); i++){ + for (size_t i = 0; i < preds.size(); i++){ float dif = preds[i] - labels[i]; ans += dif * dif; } @@ -379,8 +380,8 @@ namespace 
xgboost{ */ inline float NegLoglikelihoodLoss(const std::vector &preds, const std::vector &labels) const{ float ans = 0.0; - for(size_t i = 0; i < preds.size(); i++) - ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]); + for (size_t i = 0; i < preds.size(); i++) + ans -= labels[i] * logf(preds[i]) + (1 - labels[i]) * logf(1 - preds[i]); return ans; } }; diff --git a/regression/xgboost_reg_data.h b/regression/xgboost_reg_data.h index 905b80cbc..b00eb1c94 100644 --- a/regression/xgboost_reg_data.h +++ b/regression/xgboost_reg_data.h @@ -27,111 +27,112 @@ namespace xgboost{ std::vector labels; public: /*! \brief default constructor */ - DMatrix( void ){} + DMatrix(void){} /*! \brief get the number of instances */ inline size_t Size() const{ return labels.size(); } - /*! - * \brief load from text file + /*! + * \brief load from text file * \param fname name of text data * \param silent whether print information or not - */ - inline void LoadText( const char* fname, bool silent = false ){ + */ + inline void LoadText(const char* fname, bool silent = false){ data.Clear(); - FILE* file = utils::FopenCheck( fname, "r" ); + FILE* file = utils::FopenCheck(fname, "r"); float label; bool init = true; - char tmp[ 1024 ]; + char tmp[1024]; std::vector findex; std::vector fvalue; - while( fscanf( file, "%s", tmp ) == 1 ){ + while (fscanf(file, "%s", tmp) == 1){ unsigned index; float value; - if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){ - findex.push_back( index ); fvalue.push_back( value ); - }else{ - if( !init ){ - labels.push_back( label ); - data.AddRow( findex, fvalue ); + if (sscanf(tmp, "%u:%f", &index, &value) == 2){ + findex.push_back(index); fvalue.push_back(value); + } + else{ + if (!init){ + labels.push_back(label); + data.AddRow(findex, fvalue); } findex.clear(); fvalue.clear(); - utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" ); + utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format"); init = false; } } - 
labels.push_back( label ); - data.AddRow( findex, fvalue ); + labels.push_back(label); + data.AddRow(findex, fvalue); // initialize column support as well data.InitData(); - if( !silent ){ - printf("%ux%u matrix with %lu entries is loaded from %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname ); + if (!silent){ + printf("%ux%u matrix with %lu entries is loaded from %s\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); } fclose(file); } - /*! - * \brief load from binary file + /*! + * \brief load from binary file * \param fname name of binary data * \param silent whether print information or not * \return whether loading is success */ - inline bool LoadBinary( const char* fname, bool silent = false ){ - FILE *fp = fopen64( fname, "rb" ); - if( fp == NULL ) return false; - utils::FileStream fs( fp ); - data.LoadBinary( fs ); - labels.resize( data.NumRow() ); - utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" ); + inline bool LoadBinary(const char* fname, bool silent = false){ + FILE *fp = fopen64(fname, "rb"); + if (fp == NULL) return false; + utils::FileStream fs(fp); + data.LoadBinary(fs); + labels.resize(data.NumRow()); + utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary"); fs.Close(); // initialize column support as well data.InitData(); - if( !silent ){ - printf("%ux%u matrix with %lu entries is loaded from %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname ); + if (!silent){ + printf("%ux%u matrix with %lu entries is loaded from %s\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); } return true; } - /*! + /*! 
* \brief save to binary file * \param fname name of binary data * \param silent whether print information or not */ - inline void SaveBinary( const char* fname, bool silent = false ){ + inline void SaveBinary(const char* fname, bool silent = false){ // initialize column support as well data.InitData(); - utils::FileStream fs( utils::FopenCheck( fname, "wb" ) ); - data.SaveBinary( fs ); - fs.Write( &labels[0], sizeof(float) * data.NumRow() ); + utils::FileStream fs(utils::FopenCheck(fname, "wb")); + data.SaveBinary(fs); + fs.Write(&labels[0], sizeof(float)* data.NumRow()); fs.Close(); - if( !silent ){ - printf("%ux%u matrix with %lu entries is saved to %s\n", - (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname ); + if (!silent){ + printf("%ux%u matrix with %lu entries is saved to %s\n", + (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname); } } - /*! + /*! * \brief cache load data given a file name, if filename ends with .buffer, direct load binary * otherwise the function will first check if fname + '.buffer' exists, * if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file, - * and try to create a buffer file + * and try to create a buffer file * \param fname name of binary data * \param silent whether print information or not * \param savebuffer whether do save binary buffer if it is text */ - inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){ - int len = strlen( fname ); - if( len > 8 && !strcmp( fname + len - 7, ".buffer") ){ - this->LoadBinary( fname, silent ); return; + inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){ + int len = strlen(fname); + if (len > 8 && !strcmp(fname + len - 7, ".buffer")){ + this->LoadBinary(fname, silent); return; } - char bname[ 1024 ]; - sprintf( bname, "%s.buffer", fname ); - if( !this->LoadBinary( bname, silent ) ){ - this->LoadText( 
fname, silent ); - if( savebuffer ) this->SaveBinary( bname, silent ); + char bname[1024]; + sprintf(bname, "%s.buffer", fname); + if (!this->LoadBinary(bname, silent)){ + this->LoadText(fname, silent); + if (savebuffer) this->SaveBinary(bname, silent); } } }; diff --git a/regression/xgboost_reg_eval.h b/regression/xgboost_reg_eval.h index 9b69cdd3f..ff24ca69b 100644 --- a/regression/xgboost_reg_eval.h +++ b/regression/xgboost_reg_eval.h @@ -16,72 +16,73 @@ namespace xgboost{ namespace regression{ /*! \brief evaluator that evaluates the loss metrics */ struct IEvaluator{ - /*! - * \brief evaluate a specific metric + /*! + * \brief evaluate a specific metric * \param preds prediction * \param labels label */ - virtual float Eval( const std::vector &preds, - const std::vector &labels ) const= 0; + virtual float Eval(const std::vector &preds, + const std::vector &labels) const = 0; /*! \return name of metric */ - virtual const char *Name( void ) const= 0; + virtual const char *Name(void) const = 0; }; /*! \brief RMSE */ - struct EvalRMSE : public IEvaluator{ - virtual float Eval( const std::vector &preds, - const std::vector &labels ) const{ - const unsigned ndata = static_cast( preds.size() ); + struct EvalRMSE : public IEvaluator{ + virtual float Eval(const std::vector &preds, + const std::vector &labels) const{ + const unsigned ndata = static_cast(preds.size()); float sum = 0.0; - #pragma omp parallel for reduction(+:sum) schedule( static ) - for( unsigned i = 0; i < ndata; ++ i ){ +#pragma omp parallel for reduction(+:sum) schedule( static ) + for (unsigned i = 0; i < ndata; ++i){ float diff = preds[i] - labels[i]; sum += diff * diff; - } - return sqrtf( sum / ndata ); + } + return sqrtf(sum / ndata); } - virtual const char *Name( void ) const{ + virtual const char *Name(void) const{ return "rmse"; } }; /*! 
\brief Error */ - struct EvalError : public IEvaluator{ - virtual float Eval( const std::vector &preds, - const std::vector &labels ) const{ - const unsigned ndata = static_cast( preds.size() ); + struct EvalError : public IEvaluator{ + virtual float Eval(const std::vector &preds, + const std::vector &labels) const{ + const unsigned ndata = static_cast(preds.size()); unsigned nerr = 0; - #pragma omp parallel for reduction(+:nerr) schedule( static ) - for( unsigned i = 0; i < ndata; ++ i ){ - if( preds[i] > 0.5f ){ - if( labels[i] < 0.5f ) nerr += 1; - }else{ - if( labels[i] > 0.5f ) nerr += 1; +#pragma omp parallel for reduction(+:nerr) schedule( static ) + for (unsigned i = 0; i < ndata; ++i){ + if (preds[i] > 0.5f){ + if (labels[i] < 0.5f) nerr += 1; } - } + else{ + if (labels[i] > 0.5f) nerr += 1; + } + } return static_cast(nerr) / ndata; } - virtual const char *Name( void ) const{ + virtual const char *Name(void) const{ return "error"; } }; /*! \brief Error */ - struct EvalLogLoss : public IEvaluator{ - virtual float Eval( const std::vector &preds, - const std::vector &labels ) const{ - const unsigned ndata = static_cast( preds.size() ); + struct EvalLogLoss : public IEvaluator{ + virtual float Eval(const std::vector &preds, + const std::vector &labels) const{ + const unsigned ndata = static_cast(preds.size()); unsigned nerr = 0; - #pragma omp parallel for reduction(+:nerr) schedule( static ) - for( unsigned i = 0; i < ndata; ++ i ){ +#pragma omp parallel for reduction(+:nerr) schedule( static ) + for (unsigned i = 0; i < ndata; ++i){ const float y = labels[i]; const float py = preds[i]; - nerr -= y * std::log(py) + (1.0f-y)*std::log(1-py); - } + nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py); + } return static_cast(nerr) / ndata; } - virtual const char *Name( void ) const{ + virtual const char *Name(void) const{ return "negllik"; } }; @@ -91,28 +92,28 @@ namespace xgboost{ /*! 
\brief a set of evaluators */ struct EvalSet{ public: - inline void AddEval( const char *name ){ - if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ ); - if( !strcmp( name, "error") ) evals_.push_back( &error_ ); - if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ ); + inline void AddEval(const char *name){ + if (!strcmp(name, "rmse")) evals_.push_back(&rmse_); + if (!strcmp(name, "error")) evals_.push_back(&error_); + if (!strcmp(name, "logloss")) evals_.push_back(&logloss_); } - inline void Init( void ){ - std::sort( evals_.begin(), evals_.end() ); - evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() ); + inline void Init(void){ + std::sort(evals_.begin(), evals_.end()); + evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin()); } - inline void Eval( FILE *fo, const char *evname, - const std::vector &preds, - const std::vector &labels ) const{ - for( size_t i = 0; i < evals_.size(); ++ i ){ - float res = evals_[i]->Eval( preds, labels ); - fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res ); - } + inline void Eval(FILE *fo, const char *evname, + const std::vector &preds, + const std::vector &labels) const{ + for (size_t i = 0; i < evals_.size(); ++i){ + float res = evals_[i]->Eval(preds, labels); + fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res); + } } private: EvalRMSE rmse_; EvalError error_; EvalLogLoss logloss_; - std::vector evals_; + std::vector evals_; }; }; }; diff --git a/regression/xgboost_reg_main.cpp b/regression/xgboost_reg_main.cpp index 9d43c22fb..f3ff4003f 100644 --- a/regression/xgboost_reg_main.cpp +++ b/regression/xgboost_reg_main.cpp @@ -16,83 +16,84 @@ namespace xgboost{ * given the configuation * \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com */ - class RegBoostTask{ + class RegBoostTask{ public: - inline int Run( int argc, char *argv[] ){ - if( argc < 2 ){ - printf("Usage: \n"); + inline int Run(int argc, char *argv[]){ + if (argc < 
2){ + printf("Usage: \n"); return 0; } - utils::ConfigIterator itr( argv[1] ); - while( itr.Next() ){ - this->SetParam( itr.name(), itr.val() ); + utils::ConfigIterator itr(argv[1]); + while (itr.Next()){ + this->SetParam(itr.name(), itr.val()); } - for( int i = 2; i < argc; i ++ ){ + for (int i = 2; i < argc; i++){ char name[256], val[256]; - if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){ - this->SetParam( name, val ); + if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){ + this->SetParam(name, val); } } this->InitData(); this->InitLearner(); - if( task == "dump" ){ + if (task == "dump"){ this->TaskDump(); return 0; } - if( task == "interact" ){ + if (task == "interact"){ this->TaskInteractive(); return 0; } - if( task == "dumppath" ){ + if (task == "dumppath"){ this->TaskDumpPath(); return 0; } - if( task == "eval" ){ + if (task == "eval"){ this->TaskEval(); return 0; } - if( task == "pred" ){ + if (task == "pred"){ this->TaskPred(); - }else{ + } + else{ this->TaskTrain(); } return 0; } - inline void SetParam( const char *name, const char *val ){ - if( !strcmp("silent", name ) ) silent = atoi( val ); - if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val ); - if( !strcmp("seed", name ) ) random::Seed( atoi(val) ); - if( !strcmp("num_round", name ) ) num_round = atoi( val ); - if( !strcmp("save_period", name ) ) save_period = atoi( val ); - if( !strcmp("task", name ) ) task = val; - if( !strcmp("data", name ) ) train_path = val; - if( !strcmp("test:data", name ) ) test_path = val; - if( !strcmp("model_in", name ) ) model_in = val; - if( !strcmp("model_out", name ) ) model_out = val; - if( !strcmp("model_dir", name ) ) model_dir_path = val; - if( !strcmp("fmap", name ) ) name_fmap = val; - if( !strcmp("name_dump", name ) ) name_dump = val; - if( !strcmp("name_dumppath", name ) ) name_dumppath = val; - if( !strcmp("name_pred", name ) ) name_pred = val; - if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val ); - if( !strcmp("interact:action", name 
) ) interact_action = val; - if( !strncmp("batch:", name, 6 ) ){ - cfg_batch.PushBack( name + 6, val ); + inline void SetParam(const char *name, const char *val){ + if (!strcmp("silent", name)) silent = atoi(val); + if (!strcmp("use_buffer", name)) use_buffer = atoi(val); + if (!strcmp("seed", name)) random::Seed(atoi(val)); + if (!strcmp("num_round", name)) num_round = atoi(val); + if (!strcmp("save_period", name)) save_period = atoi(val); + if (!strcmp("task", name)) task = val; + if (!strcmp("data", name)) train_path = val; + if (!strcmp("test:data", name)) test_path = val; + if (!strcmp("model_in", name)) model_in = val; + if (!strcmp("model_out", name)) model_out = val; + if (!strcmp("model_dir", name)) model_dir_path = val; + if (!strcmp("fmap", name)) name_fmap = val; + if (!strcmp("name_dump", name)) name_dump = val; + if (!strcmp("name_dumppath", name)) name_dumppath = val; + if (!strcmp("name_pred", name)) name_pred = val; + if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); + if (!strcmp("interact:action", name)) interact_action = val; + if (!strncmp("batch:", name, 6)){ + cfg_batch.PushBack(name + 6, val); } - if( !strncmp("eval[", name, 5 ) ) { - char evname[ 256 ]; - utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display"); - eval_data_names.push_back( std::string( evname ) ); - eval_data_paths.push_back( std::string( val ) ); + if (!strncmp("eval[", name, 5)) { + char evname[256]; + utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); + eval_data_names.push_back(std::string(evname)); + eval_data_paths.push_back(std::string(val)); } - cfg.PushBack( name, val ); + cfg.PushBack(name, val); } public: - RegBoostTask( void ){ + RegBoostTask(void){ // default parameters silent = 0; use_buffer = 1; num_round = 10; save_period = 0; dump_model_stats = 0; - task = "train"; + task = "train"; model_in = "NULL"; model_out = "NULL"; name_fmap = "NULL"; @@ 
-102,128 +103,132 @@ namespace xgboost{ model_dir_path = "./"; interact_action = "update"; } - ~RegBoostTask( void ){ - for( size_t i = 0; i < deval.size(); i ++ ){ + ~RegBoostTask(void){ + for (size_t i = 0; i < deval.size(); i++){ delete deval[i]; } } private: - inline void InitData( void ){ - if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() ); - if( task == "dump" ) return; - if( task == "pred" || task == "dumppath" ){ - data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 ); - }else{ + inline void InitData(void){ + if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str()); + if (task == "dump") return; + if (task == "pred" || task == "dumppath"){ + data.CacheLoad(test_path.c_str(), silent != 0, use_buffer != 0); + } + else{ // training - data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 ); - utils::Assert( eval_data_names.size() == eval_data_paths.size() ); - for( size_t i = 0; i < eval_data_names.size(); ++ i ){ - deval.push_back( new DMatrix() ); - deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 ); + data.CacheLoad(train_path.c_str(), silent != 0, use_buffer != 0); + utils::Assert(eval_data_names.size() == eval_data_paths.size()); + for (size_t i = 0; i < eval_data_names.size(); ++i){ + deval.push_back(new DMatrix()); + deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0); } } - learner.SetData( &data, deval, eval_data_names ); + learner.SetData(&data, deval, eval_data_names); } - inline void InitLearner( void ){ + inline void InitLearner(void){ cfg.BeforeFirst(); - while( cfg.Next() ){ - learner.SetParam( cfg.name(), cfg.val() ); + while (cfg.Next()){ + learner.SetParam(cfg.name(), cfg.val()); } - if( model_in != "NULL" ){ - utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") ); - learner.LoadModel( fi ); + if (model_in != "NULL"){ + utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb")); + learner.LoadModel(fi); fi.Close(); - }else{ - utils::Assert( 
task == "train", "model_in not specified" ); + } + else{ + utils::Assert(task == "train", "model_in not specified"); learner.InitModel(); } learner.InitTrainer(); } - inline void TaskTrain( void ){ - const time_t start = time( NULL ); + inline void TaskTrain(void){ + const time_t start = time(NULL); unsigned long elapsed = 0; - for( int i = 0; i < num_round; ++ i ){ - elapsed = (unsigned long)(time(NULL) - start); - if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed ); - learner.UpdateOneIter( i ); - learner.EvalOneIter( i ); - if( save_period != 0 && (i+1) % save_period == 0 ){ - this->SaveModel( i ); + for (int i = 0; i < num_round; ++i){ + elapsed = (unsigned long)(time(NULL) - start); + if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); + learner.UpdateOneIter(i); + learner.EvalOneIter(i); + if (save_period != 0 && (i + 1) % save_period == 0){ + this->SaveModel(i); } - elapsed = (unsigned long)(time(NULL) - start); + elapsed = (unsigned long)(time(NULL) - start); } // always save final round - if( save_period == 0 || num_round % save_period != 0 ){ - if( model_out == "NULL" ){ - this->SaveModel( num_round - 1 ); - }else{ - this->SaveModel( model_out.c_str() ); + if (save_period == 0 || num_round % save_period != 0){ + if (model_out == "NULL"){ + this->SaveModel(num_round - 1); + } + else{ + this->SaveModel(model_out.c_str()); } } - if( !silent ){ - printf("\nupdating end, %lu sec in all\n", elapsed ); + if (!silent){ + printf("\nupdating end, %lu sec in all\n", elapsed); } } - inline void TaskEval( void ){ - learner.EvalOneIter( 0 ); + inline void TaskEval(void){ + learner.EvalOneIter(0); } - inline void TaskInteractive( void ){ - const time_t start = time( NULL ); + inline void TaskInteractive(void){ + const time_t start = time(NULL); unsigned long elapsed = 0; int batch_action = 0; - + cfg_batch.BeforeFirst(); - while( cfg_batch.Next() ){ - if( !strcmp( cfg_batch.name(), "run" ) ){ - learner.UpdateInteract( 
interact_action ); + while (cfg_batch.Next()){ + if (!strcmp(cfg_batch.name(), "run")){ + learner.UpdateInteract(interact_action); batch_action += 1; - } else{ - learner.SetParam( cfg_batch.name(), cfg_batch.val() ); + } + else{ + learner.SetParam(cfg_batch.name(), cfg_batch.val()); } } - if( batch_action == 0 ){ - learner.UpdateInteract( interact_action ); + if (batch_action == 0){ + learner.UpdateInteract(interact_action); } - utils::Assert( model_out != "NULL", "interactive mode must specify model_out" ); - this->SaveModel( model_out.c_str() ); - elapsed = (unsigned long)(time(NULL) - start); + utils::Assert(model_out != "NULL", "interactive mode must specify model_out"); + this->SaveModel(model_out.c_str()); + elapsed = (unsigned long)(time(NULL) - start); - if( !silent ){ - printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed ); + if (!silent){ + printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed); } } - inline void TaskDump( void ){ - FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" ); - learner.DumpModel( fo, fmap, dump_model_stats != 0 ); - fclose( fo ); + inline void TaskDump(void){ + FILE *fo = utils::FopenCheck(name_dump.c_str(), "w"); + learner.DumpModel(fo, fmap, dump_model_stats != 0); + fclose(fo); } - inline void TaskDumpPath( void ){ - FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" ); - learner.DumpPath( fo, data ); - fclose( fo ); + inline void TaskDumpPath(void){ + FILE *fo = utils::FopenCheck(name_dumppath.c_str(), "w"); + learner.DumpPath(fo, data); + fclose(fo); } - inline void SaveModel( const char *fname ) const{ - utils::FileStream fo( utils::FopenCheck( fname, "wb" ) ); - learner.SaveModel( fo ); + inline void SaveModel(const char *fname) const{ + utils::FileStream fo(utils::FopenCheck(fname, "wb")); + learner.SaveModel(fo); fo.Close(); } - inline void SaveModel( int i ) const{ + inline void SaveModel(int i) const{ char fname[256]; - sprintf( fname 
,"%s/%04d.model", model_dir_path.c_str(), i+1 ); - this->SaveModel( fname ); + sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1); + this->SaveModel(fname); } - inline void TaskPred( void ){ + inline void TaskPred(void){ std::vector preds; - if( !silent ) printf("start prediction...\n"); - learner.Predict( preds, data ); - if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() ); - FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" ); - for( size_t i = 0; i < preds.size(); i ++ ){ - fprintf( fo, "%f\n", preds[i] ); + if (!silent) printf("start prediction...\n"); + learner.Predict(preds, data); + if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); + FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); + for (size_t i = 0; i < preds.size(); i++){ + fprintf(fo, "%f\n", preds[i]); } - fclose( fo ); + fclose(fo); } private: /* \brief whether silent */ @@ -231,7 +236,7 @@ namespace xgboost{ /* \brief whether use auto binary buffer */ int use_buffer; /* \brief number of boosting iterations */ - int num_round; + int num_round; /* \brief the period to save the model, 0 means only save the final round model */ int save_period; /*! \brief interfact action */ @@ -257,9 +262,9 @@ namespace xgboost{ /* \brief name of dump path file */ std::string name_dumppath; /* \brief the paths of validation data sets */ - std::vector eval_data_paths; + std::vector eval_data_paths; /* \brief the names of the evaluation data used in output log */ - std::vector eval_data_names; + std::vector eval_data_names; /*! \brief saves configurations */ utils::ConfigSaver cfg; /*! 
\brief batch configurations */ @@ -274,7 +279,7 @@ namespace xgboost{ }; int main( int argc, char *argv[] ){ - xgboost::random::Seed( 0 ); - xgboost::regression::RegBoostTask tsk; - return tsk.Run( argc, argv ); + xgboost::random::Seed( 0 ); + xgboost::regression::RegBoostTask tsk; + return tsk.Run( argc, argv ); } diff --git a/utils/xgboost_config.h b/utils/xgboost_config.h index bcd58b504..22a343370 100644 --- a/utils/xgboost_config.h +++ b/utils/xgboost_config.h @@ -14,198 +14,203 @@ namespace xgboost{ namespace utils{ - /*! + /*! * \brief an iterator that iterates over a configure file and gets the configures */ class ConfigIterator{ public: - /*! - * \brief constructor + /*! + * \brief constructor * \param fname name of configure file */ - ConfigIterator( const char *fname ){ - fi = FopenCheck( fname, "r"); - ch_buf = fgetc( fi ); + ConfigIterator(const char *fname){ + fi = FopenCheck(fname, "r"); + ch_buf = fgetc(fi); } /*! \brief destructor */ ~ConfigIterator(){ - fclose( fi ); + fclose(fi); } - /*! + /*! * \brief get current name, called after Next returns true - * \return current parameter name + * \return current parameter name */ - inline const char *name( void )const{ + inline const char *name(void)const{ return s_name; } - /*! + /*! * \brief get current value, called after Next returns true - * \return current parameter value + * \return current parameter value */ - inline const char *val( void ) const{ + inline const char *val(void) const{ return s_val; } - /*! + /*! 
* \brief move iterator to next position * \return true if there is value in next position */ - inline bool Next( void ){ - while( !feof( fi ) ){ - GetNextToken( s_name ); - if( s_name[0] == '=') return false; - if( GetNextToken( s_buf ) || s_buf[0] != '=' ) return false; - if( GetNextToken( s_val ) || s_val[0] == '=' ) return false; + inline bool Next(void){ + while (!feof(fi)){ + GetNextToken(s_name); + if (s_name[0] == '=') return false; + if (GetNextToken(s_buf) || s_buf[0] != '=') return false; + if (GetNextToken(s_val) || s_val[0] == '=') return false; return true; } return false; } private: - FILE *fi; + FILE *fi; char ch_buf; - char s_name[256],s_val[256],s_buf[246]; - - inline void SkipLine(){ + char s_name[256], s_val[256], s_buf[246]; + + inline void SkipLine(){ do{ - ch_buf = fgetc( fi ); - }while( ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r' ); + ch_buf = fgetc(fi); + } while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r'); } - - inline void ParseStr( char tok[] ){ - int i = 0; - while( (ch_buf = fgetc(fi)) != EOF ){ - switch( ch_buf ){ - case '\\': tok[i++] = fgetc( fi ); break; - case '\"': tok[i++] = '\0'; - return; + + inline void ParseStr(char tok[]){ + int i = 0; + while ((ch_buf = fgetc(fi)) != EOF){ + switch (ch_buf){ + case '\\': tok[i++] = fgetc(fi); break; + case '\"': tok[i++] = '\0'; + return; case '\r': case '\n': Error("unterminated string"); break; default: tok[i++] = ch_buf; } } - Error("unterminated string"); + Error("unterminated string"); } // return newline - inline bool GetNextToken( char tok[] ){ + inline bool GetNextToken(char tok[]){ int i = 0; - bool new_line = false; - while( ch_buf != EOF ){ - switch( ch_buf ){ - case '#' : SkipLine(); new_line = true; break; + bool new_line = false; + while (ch_buf != EOF){ + switch (ch_buf){ + case '#': SkipLine(); new_line = true; break; case '\"': - if( i == 0 ){ - ParseStr( tok );ch_buf = fgetc(fi); return new_line; - }else{ - Error("token followed directly by string"); + if (i 
== 0){ + ParseStr(tok); ch_buf = fgetc(fi); return new_line; + } + else{ + Error("token followed directly by string"); } case '=': - if( i == 0 ) { - ch_buf = fgetc( fi ); - tok[0] = '='; - tok[1] = '\0'; - }else{ - tok[i] = '\0'; + if (i == 0) { + ch_buf = fgetc(fi); + tok[0] = '='; + tok[1] = '\0'; + } + else{ + tok[i] = '\0'; } return new_line; - case '\r': + case '\r': case '\n': - if( i == 0 ) new_line = true; + if (i == 0) new_line = true; case '\t': - case ' ' : - ch_buf = fgetc( fi ); - if( i > 0 ){ - tok[i] = '\0'; + case ' ': + ch_buf = fgetc(fi); + if (i > 0){ + tok[i] = '\0'; return new_line; - } + } break; - default: + default: tok[i++] = ch_buf; - ch_buf = fgetc( fi ); - break; + ch_buf = fgetc(fi); + break; } } return true; } }; }; - + namespace utils{ - /*! - * \brief a class that save parameter configurations - * temporally and allows to get them out later + /*! + * \brief a class that save parameter configurations + * temporally and allows to get them out later * there are two kinds of priority in ConfigSaver */ class ConfigSaver{ public: /*! \brief constructor */ - ConfigSaver( void ){ idx = 0; } + ConfigSaver(void){ idx = 0; } /*! \brief clear all saves */ - inline void Clear( void ){ + inline void Clear(void){ idx = 0; names.clear(); values.clear(); names_high.clear(); values_high.clear(); } - /*! - * \brief push back a parameter setting + /*! 
+ * \brief push back a parameter setting * \param name name of parameter * \param val value of parameter - * \param priority whether the setting has higher priority: high priority occurs + * \param priority whether the setting has higher priority: high priority occurs * latter when read from ConfigSaver, and can overwrite existing settings */ - inline void PushBack( const char *name, const char *val, int priority = 0 ){ - if( priority == 0 ){ - names.push_back( std::string( name ) ); - values.push_back( std::string( val ) ); - }else{ - names_high.push_back( std::string( name ) ); - values_high.push_back( std::string( val ) ); + inline void PushBack(const char *name, const char *val, int priority = 0){ + if (priority == 0){ + names.push_back(std::string(name)); + values.push_back(std::string(val)); + } + else{ + names_high.push_back(std::string(name)); + values_high.push_back(std::string(val)); } } /*! \brief set pointer to beginning of the ConfigSaver */ - inline void BeforeFirst( void ){ + inline void BeforeFirst(void){ idx = 0; } - /*! + /*! * \brief move iterator to next position * \return true if there is value in next position */ - inline bool Next( void ){ - if( idx >= names.size() + names_high.size() ){ + inline bool Next(void){ + if (idx >= names.size() + names_high.size()){ return false; } - idx ++; + idx++; return true; } - /*! + /*! * \brief get current name, called after Next returns true - * \return current parameter name - */ - inline const char *name( void ) const{ - Assert( idx > 0, "can't call name before first"); + * \return current parameter name + */ + inline const char *name(void) const{ + Assert(idx > 0, "can't call name before first"); size_t i = idx - 1; - if( i >= names.size() ){ - return names_high[ i - names.size() ].c_str(); - }else{ - return names[ i ].c_str(); + if (i >= names.size()){ + return names_high[i - names.size()].c_str(); + } + else{ + return names[i].c_str(); } } - /*! + /*! 
* \brief get current value, called after Next returns true - * \return current parameter value + * \return current parameter value */ - inline const char *val( void ) const{ - Assert( idx > 0, "can't call name before first"); + inline const char *val(void) const{ + Assert(idx > 0, "can't call name before first"); size_t i = idx - 1; - if( i >= values.size() ){ - return values_high[ i - values.size() ].c_str(); - }else{ - return values[ i ].c_str(); + if (i >= values.size()){ + return values_high[i - values.size()].c_str(); + } + else{ + return values[i].c_str(); } } private: std::vector names; std::vector values; std::vector names_high; - std::vector values_high; + std::vector values_high; size_t idx; }; }; diff --git a/utils/xgboost_fmap.h b/utils/xgboost_fmap.h index 4ab7e3909..cbb974d9b 100644 --- a/utils/xgboost_fmap.h +++ b/utils/xgboost_fmap.h @@ -16,48 +16,48 @@ namespace xgboost{ class FeatMap{ public: enum Type{ - kIndicator = 0, + kIndicator = 0, kQuantitive = 1, kInteger = 2, kFloat = 3 }; public: /*! \brief load feature map from text format */ - inline void LoadText( const char *fname ){ - FILE *fi = utils::FopenCheck( fname, "r" ); - this->LoadText( fi ); - fclose( fi ); + inline void LoadText(const char *fname){ + FILE *fi = utils::FopenCheck(fname, "r"); + this->LoadText(fi); + fclose(fi); } /*! \brief load feature map from text format */ - inline void LoadText( FILE *fi ){ + inline void LoadText(FILE *fi){ int fid; char fname[256], ftype[256]; - while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){ - utils::Assert( fid == (int)names_.size(), "invalid fmap format" ); - names_.push_back( std::string(fname) ); - types_.push_back( GetType( ftype ) ); + while (fscanf(fi, "%d%s%s", &fid, fname, ftype) == 3){ + utils::Assert(fid == (int)names_.size(), "invalid fmap format"); + names_.push_back(std::string(fname)); + types_.push_back(GetType(ftype)); } } /*! 
\brief number of known features */ - size_t size( void ) const{ + size_t size(void) const{ return names_.size(); } /*! \brief return name of specific feature */ - const char* name( size_t idx ) const{ - utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" ); - return names_[ idx ].c_str(); + const char* name(size_t idx) const{ + utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound"); + return names_[idx].c_str(); } /*! \brief return type of specific feature */ - const Type& type( size_t idx ) const{ - utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" ); - return types_[ idx ]; + const Type& type(size_t idx) const{ + utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound"); + return types_[idx]; } private: - inline static Type GetType( const char *tname ){ - if( !strcmp( "i", tname ) ) return kIndicator; - if( !strcmp( "q", tname ) ) return kQuantitive; - if( !strcmp( "int", tname ) ) return kInteger; - if( !strcmp( "float", tname ) ) return kFloat; + inline static Type GetType(const char *tname){ + if (!strcmp("i", tname)) return kIndicator; + if (!strcmp("q", tname)) return kQuantitive; + if (!strcmp("int", tname)) return kInteger; + if (!strcmp("float", tname)) return kFloat; utils::Error("unknown feature type, use i for indicator and q for quantity"); return kIndicator; } @@ -73,50 +73,50 @@ namespace xgboost{ /*! 
\brief feature constraint, allow or disallow some feature during training */ class FeatConstrain{ public: - FeatConstrain( void ){ + FeatConstrain(void){ default_state_ = +1; } /*!\brief set parameters */ - inline void SetParam( const char *name, const char *val ){ + inline void SetParam(const char *name, const char *val){ int a, b; - if( !strcmp( name, "fban") ){ - this->ParseRange( val, a, b ); - this->SetRange( a, b, -1 ); + if (!strcmp(name, "fban")){ + this->ParseRange(val, a, b); + this->SetRange(a, b, -1); } - if( !strcmp( name, "fpass") ){ - this->ParseRange( val, a, b ); - this->SetRange( a, b, +1 ); + if (!strcmp(name, "fpass")){ + this->ParseRange(val, a, b); + this->SetRange(a, b, +1); } - if( !strcmp( name, "fdefault") ){ - default_state_ = atoi( val ); + if (!strcmp(name, "fdefault")){ + default_state_ = atoi(val); } } /*! \brief whether constrain is specified */ - inline bool HasConstrain( void ) const { + inline bool HasConstrain(void) const { return state_.size() != 0 && default_state_ == 1; } /*! \brief whether a feature index is banned or not */ - inline bool NotBanned( unsigned index ) const{ + inline bool NotBanned(unsigned index) const{ int rt = index < state_.size() ? state_[index] : default_state_; - if( rt == 0 ) rt = default_state_; + if (rt == 0) rt = default_state_; return rt == 1; } private: - inline void SetRange( int a, int b, int st ){ - if( b > (int)state_.size() ) state_.resize( b, 0 ); - for( int i = a; i < b; ++ i ){ + inline void SetRange(int a, int b, int st){ + if (b >(int)state_.size()) state_.resize(b, 0); + for (int i = a; i < b; ++i){ state_[i] = st; - } + } } - inline void ParseRange( const char *val, int &a, int &b ){ - if( sscanf( val, "%d-%d", &a, &b ) == 2 ) return; - utils::Assert( sscanf( val, "%d", &a ) == 1 ); + inline void ParseRange(const char *val, int &a, int &b){ + if (sscanf(val, "%d-%d", &a, &b) == 2) return; + utils::Assert(sscanf(val, "%d", &a) == 1); b = a + 1; } /*! 
\brief default state */ int default_state_; /*! \brief whether the state here is, +1:pass, -1: ban, 0:default */ - std::vector state_; + std::vector state_; }; }; // namespace utils }; // namespace xgboost diff --git a/utils/xgboost_matrix_csr.h b/utils/xgboost_matrix_csr.h index aa47f6dbb..7ac9a30b6 100644 --- a/utils/xgboost_matrix_csr.h +++ b/utils/xgboost_matrix_csr.h @@ -2,7 +2,7 @@ * \file xgboost_matrix_csr.h * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix * \author Tianqi Chen: tianqi.tchen@gmail.com -*/ + */ #ifndef XGBOOST_MATRIX_CSR_H #define XGBOOST_MATRIX_CSR_H #include @@ -11,13 +11,13 @@ namespace xgboost{ namespace utils{ - /*! - * \brief a class used to help construct CSR format matrix, + /*! + * \brief a class used to help construct CSR format matrix, * can be used to convert row major CSR to column major CSR * \tparam IndexType type of index used to store the index position, usually unsigned or size_t * \tparam whether enabling the usage of aclist, this option must be enabled manually */ - template + template struct SparseCSRMBuilder{ private: /*! \brief dummy variable used in the indicator matrix construction */ @@ -29,100 +29,102 @@ namespace xgboost{ /*! \brief a list of active rows, used when many rows are empty */ std::vector &aclist; public: - SparseCSRMBuilder( std::vector &p_rptr, - std::vector &p_findex ) - :rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){ - Assert( !UseAcList, "enabling bug" ); - } - /*! \brief use with caution! rptr must be cleaned before use */ - SparseCSRMBuilder( std::vector &p_rptr, - std::vector &p_findex, - std::vector &p_aclist ) - :rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){ - Assert( UseAcList, "must manually enable the option use aclist" ); + SparseCSRMBuilder(std::vector &p_rptr, + std::vector &p_findex) + :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist){ + Assert(!UseAcList, "enabling bug"); + } + /*! \brief use with caution! 
rptr must be cleaned before use */ + SparseCSRMBuilder(std::vector &p_rptr, + std::vector &p_findex, + std::vector &p_aclist) + :rptr(p_rptr), findex(p_findex), aclist(p_aclist){ + Assert(UseAcList, "must manually enable the option use aclist"); } public: - /*! + /*! * \brief step 1: initialize the number of rows in the data, not necessary exact * \nrows number of rows in the matrix, can be smaller than expected */ - inline void InitBudget( size_t nrows = 0 ){ - if( !UseAcList ){ + inline void InitBudget(size_t nrows = 0){ + if (!UseAcList){ rptr.clear(); - rptr.resize( nrows + 1, 0 ); - }else{ - Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" ); + rptr.resize(nrows + 1, 0); + } + else{ + Assert(nrows + 1 == rptr.size(), "rptr must be initialized already"); this->Cleanup(); } } - /*! + /*! * \brief step 2: add budget to each rows, this function is called when aclist is used * \param row_id the id of the row * \param nelem number of element budget add to this row */ - inline void AddBudget( size_t row_id, size_t nelem = 1 ){ - if( rptr.size() < row_id + 2 ){ - rptr.resize( row_id + 2, 0 ); + inline void AddBudget(size_t row_id, size_t nelem = 1){ + if (rptr.size() < row_id + 2){ + rptr.resize(row_id + 2, 0); } - if( UseAcList ){ - if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id ); + if (UseAcList){ + if (rptr[row_id + 1] == 0) aclist.push_back(row_id); } - rptr[ row_id + 1 ] += nelem; + rptr[row_id + 1] += nelem; } /*! 
\brief step 3: initialize the necessary storage */ - inline void InitStorage( void ){ + inline void InitStorage(void){ // initialize rptr to be beginning of each segment size_t start = 0; - if( !UseAcList ){ - for( size_t i = 1; i < rptr.size(); i ++ ){ - size_t rlen = rptr[ i ]; - rptr[ i ] = start; - start += rlen; - } - }else{ - // case with active list - std::sort( aclist.begin(), aclist.end() ); - - for( size_t i = 0; i < aclist.size(); i ++ ){ - size_t ridx = aclist[ i ]; - size_t rlen = rptr[ ridx + 1 ]; - rptr[ ridx + 1 ] = start; - // set previous rptr to right position if previous feature is not active - if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start; + if (!UseAcList){ + for (size_t i = 1; i < rptr.size(); i++){ + size_t rlen = rptr[i]; + rptr[i] = start; start += rlen; } } - findex.resize( start ); + else{ + // case with active list + std::sort(aclist.begin(), aclist.end()); + + for (size_t i = 0; i < aclist.size(); i++){ + size_t ridx = aclist[i]; + size_t rlen = rptr[ridx + 1]; + rptr[ridx + 1] = start; + // set previous rptr to right position if previous feature is not active + if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start; + start += rlen; + } + } + findex.resize(start); } - /*! - * \brief step 4: - * used in indicator matrix construction, add new - * element to each row, the number of calls shall be exactly same as add_budget + /*! + * \brief step 4: + * used in indicator matrix construction, add new + * element to each row, the number of calls shall be exactly same as add_budget */ - inline void PushElem( size_t row_id, IndexType col_id ){ - size_t &rp = rptr[ row_id + 1 ]; - findex[ rp ++ ] = col_id; + inline void PushElem(size_t row_id, IndexType col_id){ + size_t &rp = rptr[row_id + 1]; + findex[rp++] = col_id; } - /*! + /*! 
* \brief step 5: only needed when aclist is used * clean up the rptr for next usage - */ - inline void Cleanup( void ){ - Assert( UseAcList, "this function can only be called use AcList" ); - for( size_t i = 0; i < aclist.size(); i ++ ){ + */ + inline void Cleanup(void){ + Assert(UseAcList, "this function can only be called use AcList"); + for (size_t i = 0; i < aclist.size(); i++){ const size_t ridx = aclist[i]; - rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0; + rptr[ridx] = 0; rptr[ridx + 1] = 0; } aclist.clear(); } }; }; - + namespace utils{ - /*! + /*! * \brief simple sparse matrix container * \tparam IndexType type of index used to store the index position, usually unsigned or size_t - */ + */ template struct SparseCSRMat{ private: @@ -134,22 +136,22 @@ namespace xgboost{ /*! \brief matrix builder*/ SparseCSRMBuilder builder; public: - SparseCSRMat( void ):builder( rptr, findex ){ - } + SparseCSRMat(void) :builder(rptr, findex){ + } public: /*! \return number of rows in the matrx */ - inline size_t NumRow( void ) const{ + inline size_t NumRow(void) const{ return rptr.size() - 1; } /*! \return number of elements r-th row */ - inline size_t NumElem( size_t r ) const{ - return rptr[ r + 1 ] - rptr[ r ]; + inline size_t NumElem(size_t r) const{ + return rptr[r + 1] - rptr[r]; } - /*! \return r-th row */ - inline const IndexType *operator[]( size_t r ) const{ - return &findex[ rptr[r] ]; - } - }; + /*! \return r-th row */ + inline const IndexType *operator[](size_t r) const{ + return &findex[rptr[r]]; + } + }; }; }; #endif diff --git a/utils/xgboost_omp.h b/utils/xgboost_omp.h index 8fb80d302..ea1e7173c 100644 --- a/utils/xgboost_omp.h +++ b/utils/xgboost_omp.h @@ -3,16 +3,16 @@ /*! 
* \file xgboost_omp.h * \brief header to handle OpenMP compatibility issues - * + * * \author Tianqi Chen: tianqi.tchen@gmail.com */ #if defined(_OPENMP) #include #else -//#warning "OpenMP is not available, compile to single thread code" +#warning "OpenMP is not available, compile to single thread code" inline int omp_get_thread_num() { return 0; } inline int omp_get_num_threads() { return 1; } -inline void omp_set_num_threads( int nthread ) {} +inline void omp_set_num_threads(int nthread) {} #endif #endif diff --git a/utils/xgboost_random.h b/utils/xgboost_random.h index d2d7625a7..f955d36b8 100644 --- a/utils/xgboost_random.h +++ b/utils/xgboost_random.h @@ -23,107 +23,108 @@ typedef unsigned int uint32_t; namespace xgboost{ namespace random{ /*! \brief seed the PRNG */ - inline void Seed( uint32_t seed ){ - srand( seed ); + inline void Seed(uint32_t seed){ + srand(seed); } - + /*! \brief return a real number uniform in [0,1) */ inline double NextDouble(){ - return static_cast( rand() ) / (static_cast( RAND_MAX )+1.0); + return static_cast(rand()) / (static_cast(RAND_MAX)+1.0); } /*! \brief return a real numer uniform in (0,1) */ inline double NextDouble2(){ - return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); + return (static_cast(rand()) + 1.0) / (static_cast(RAND_MAX)+2.0); } }; - + namespace random{ /*! \brief return a random number */ - inline uint32_t NextUInt32( void ){ + inline uint32_t NextUInt32(void){ return (uint32_t)rand(); } /*! \brief return a random number in n */ - inline uint32_t NextUInt32( uint32_t n ){ - return (uint32_t) floor( NextDouble() * n ) ; - } + inline uint32_t NextUInt32(uint32_t n){ + return (uint32_t)floor(NextDouble() * n); + } /*! 
\brief return x~N(0,1) */ inline double SampleNormal(){ - double x,y,s; + double x, y, s; do{ x = 2 * NextDouble2() - 1.0; y = 2 * NextDouble2() - 1.0; s = x*x + y*y; - }while( s >= 1.0 || s == 0.0 ); - - return x * sqrt( -2.0 * log(s) / s ) ; + } while (s >= 1.0 || s == 0.0); + + return x * sqrt(-2.0 * log(s) / s); } - + /*! \brief return iid x,y ~N(0,1) */ - inline void SampleNormal2D( double &xx, double &yy ){ - double x,y,s; + inline void SampleNormal2D(double &xx, double &yy){ + double x, y, s; do{ x = 2 * NextDouble2() - 1.0; y = 2 * NextDouble2() - 1.0; s = x*x + y*y; - }while( s >= 1.0 || s == 0.0 ); - double t = sqrt( -2.0 * log(s) / s ) ; - xx = x * t; + } while (s >= 1.0 || s == 0.0); + double t = sqrt(-2.0 * log(s) / s); + xx = x * t; yy = y * t; } /*! \brief return x~N(mu,sigma^2) */ - inline double SampleNormal( double mu, double sigma ){ + inline double SampleNormal(double mu, double sigma){ return SampleNormal() * sigma + mu; } /*! \brief return 1 with probability p, coin flip */ - inline int SampleBinary( double p ){ - return NextDouble() < p; + inline int SampleBinary(double p){ + return NextDouble() < p; } - + /*! 
\brief return distribution from Gamma( alpha, beta ) */ - inline double SampleGamma( double alpha, double beta ) { - if ( alpha < 1.0 ) { + inline double SampleGamma(double alpha, double beta) { + if (alpha < 1.0) { double u; do { u = NextDouble(); } while (u == 0.0); return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha); - } else { - double d,c,x,v,u; - d = alpha - 1.0/3.0; - c = 1.0 / sqrt( 9.0 * d ); + } + else { + double d, c, x, v, u; + d = alpha - 1.0 / 3.0; + c = 1.0 / sqrt(9.0 * d); do { do { x = SampleNormal(); v = 1.0 + c*x; - } while ( v <= 0.0 ); + } while (v <= 0.0); v = v * v * v; u = NextDouble(); - } while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x))) - && (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) ); + } while ((u >= (1.0 - 0.0331 * (x*x) * (x*x))) + && (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v))))); return d * v / beta; } } - + template - inline void Exchange( T &a, T &b ){ + inline void Exchange(T &a, T &b){ T c; c = a; a = b; b = c; } - + template - inline void Shuffle( T *data, size_t sz ){ - if( sz == 0 ) return; - for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){ - Exchange( data[i], data[ NextUInt32( i+1 ) ] ); - } + inline void Shuffle(T *data, size_t sz){ + if (sz == 0) return; + for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){ + Exchange(data[i], data[NextUInt32(i + 1)]); + } } // random shuffle the data inside, require PRNG template - inline void Shuffle( std::vector &data ){ - Shuffle( &data[0], data.size() ); + inline void Shuffle(std::vector &data){ + Shuffle(&data[0], data.size()); } }; }; diff --git a/utils/xgboost_stream.h b/utils/xgboost_stream.h index 7388e01c1..b7b513d18 100644 --- a/utils/xgboost_stream.h +++ b/utils/xgboost_stream.h @@ -9,44 +9,44 @@ */ namespace xgboost{ namespace utils{ - /*! - * \brief interface of stream I/O, used to serialize model + /*! + * \brief interface of stream I/O, used to serialize model */ class IStream{ public: - /*! + /*! 
* \brief read data from stream * \param ptr pointer to memory buffer * \param size size of block * \return usually is the size of data readed */ - virtual size_t Read( void *ptr, size_t size ) = 0; - /*! + virtual size_t Read(void *ptr, size_t size) = 0; + /*! * \brief write data to stream * \param ptr pointer to memory buffer * \param size size of block */ - virtual void Write( const void *ptr, size_t size ) = 0; + virtual void Write(const void *ptr, size_t size) = 0; /*! \brief virtual destructor */ - virtual ~IStream( void ){} + virtual ~IStream(void){} }; /*! \brief implementation of file i/o stream */ - class FileStream: public IStream{ + class FileStream : public IStream{ private: FILE *fp; - public: - FileStream( FILE *fp ){ + public: + FileStream(FILE *fp){ this->fp = fp; } - virtual size_t Read( void *ptr, size_t size ){ - return fread( ptr, size, 1, fp ); + virtual size_t Read(void *ptr, size_t size){ + return fread(ptr, size, 1, fp); } - virtual void Write( const void *ptr, size_t size ){ - fwrite( ptr, size, 1, fp ); + virtual void Write(const void *ptr, size_t size){ + fwrite(ptr, size, 1, fp); } - inline void Close( void ){ - fclose( fp ); + inline void Close(void){ + fclose(fp); } }; }; diff --git a/utils/xgboost_utils.h b/utils/xgboost_utils.h index 9373e1076..7c0e53f2e 100644 --- a/utils/xgboost_utils.h +++ b/utils/xgboost_utils.h @@ -36,39 +36,29 @@ extern "C"{ namespace xgboost{ /*! 
\brief namespace for helper utils of the project */ namespace utils{ - inline void Error( const char *msg ){ - fprintf( stderr, "Error:%s\n",msg ); - exit( -1 ); - } - - inline void Assert( bool exp ){ - if( !exp ) Error( "AssertError" ); - } - - inline void Assert( bool exp, const char *msg ){ - if( !exp ) Error( msg ); + inline void Error(const char *msg){ + fprintf(stderr, "Error:%s\n", msg); + exit(-1); } - inline void Warning( const char *msg ){ - fprintf( stderr, "warning:%s\n",msg ); + inline void Assert(bool exp){ + if (!exp) Error("AssertError"); + } + + inline void Assert(bool exp, const char *msg){ + if (!exp) Error(msg); + } + + inline void Warning(const char *msg){ + fprintf(stderr, "warning:%s\n", msg); } /*! \brief replace fopen, report error when the file open fails */ - inline FILE *FopenCheck( const char *fname , const char *flag ){ - FILE *fp = fopen64( fname , flag ); - if( fp == NULL ){ - fprintf( stderr, "can not open file \"%s\"\n",fname ); - exit( -1 ); - } - return fp; - } - - /*! \brief replace fopen, */ - inline FILE *FopenTry( const char *fname , const char *flag ){ - FILE *fp = fopen64( fname , flag ); - if( fp == NULL ){ - fprintf( stderr, "can not open file \"%s\"\n",fname ); - exit( -1 ); + inline FILE *FopenCheck(const char *fname, const char *flag){ + FILE *fp = fopen64(fname, flag); + if (fp == NULL){ + fprintf(stderr, "can not open file \"%s\"\n", fname); + exit(-1); } return fp; }