Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts: regression/xgboost_reg_data.h
This commit is contained in:
commit
c3592dc06c
2
Makefile
2
Makefile
@ -12,6 +12,8 @@ export LDFLAGS= -pthread -lm
|
||||
|
||||
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||
|
||||
#xgboost: rank/xgboost_rank_main.cpp base/*.h rank/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||
|
||||
$(BIN) :
|
||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
#define XGBOOST_INL_HPP
|
||||
/*!
|
||||
* \file xgboost-inl.hpp
|
||||
* \brief booster implementations
|
||||
* \brief booster implementations
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
// implementation of boosters go to here
|
||||
@ -18,7 +18,7 @@
|
||||
#include "linear/xgboost_linear.hpp"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief create a gradient booster, given type of booster
|
||||
* \param booster_type type of gradient booster, can be used to specify implements
|
||||
@ -26,14 +26,14 @@ namespace xgboost{
|
||||
* \return the pointer to the gradient booster created
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){
|
||||
switch( booster_type ){
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster(int booster_type){
|
||||
switch (booster_type){
|
||||
case 0: return new RegTreeTrainer<FMatrix>();
|
||||
case 1: return new LinearBooster<FMatrix>();
|
||||
default: utils::Error("unknown booster_type"); return NULL;
|
||||
}
|
||||
}
|
||||
}; // namespace booster
|
||||
}
|
||||
}; // namespace booster
|
||||
}; // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_INL_HPP
|
||||
|
||||
@ -19,8 +19,8 @@
|
||||
namespace xgboost{
|
||||
/*! \brief namespace for boosters */
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief interface of a gradient boosting learner
|
||||
/*!
|
||||
* \brief interface of a gradient boosting learner
|
||||
* \tparam FMatrix the feature matrix format that the booster takes
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
@ -35,101 +35,101 @@ namespace xgboost{
|
||||
// call booster->LoadModel
|
||||
// (3) booster->DoBoost to update the model
|
||||
// (4) booster->Predict to get new prediction
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam( const char *name, const char *val ) = 0;
|
||||
/*!
|
||||
virtual void SetParam(const char *name, const char *val) = 0;
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
virtual void LoadModel( utils::IStream &fi ) = 0;
|
||||
/*!
|
||||
virtual void LoadModel(utils::IStream &fi) = 0;
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void SaveModel( utils::IStream &fo ) const = 0;
|
||||
virtual void SaveModel(utils::IStream &fo) const = 0;
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
virtual void InitModel( void ) = 0;
|
||||
virtual void InitModel(void) = 0;
|
||||
public:
|
||||
/*!
|
||||
* \brief do gradient boost training for one step, using the information given,
|
||||
/*!
|
||||
* \brief do gradient boost training for one step, using the information given,
|
||||
* Note: content of grad and hess can change after DoBoost
|
||||
* \param grad first order gradient of each instance
|
||||
* \param hess second order gradient of each instance
|
||||
* \param feats features of each instance
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
virtual void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &feats,
|
||||
const std::vector<unsigned> &root_index ) = 0;
|
||||
/*!
|
||||
virtual void DoBoost(std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &feats,
|
||||
const std::vector<unsigned> &root_index) = 0;
|
||||
/*!
|
||||
* \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree
|
||||
* \param path the result of path
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param root_index root id of current instance, default = 0
|
||||
*/
|
||||
virtual void PredPath( std::vector<int> &path, const FMatrix &feats,
|
||||
bst_uint row_index, unsigned root_index = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
virtual void PredPath(std::vector<int> &path, const FMatrix &feats,
|
||||
bst_uint row_index, unsigned root_index = 0){
|
||||
utils::Error("not implemented");
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief predict values for given sparse feature vector
|
||||
*
|
||||
*
|
||||
* NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general,
|
||||
* use the dense version of Predict to ensure threadsafety
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param root_index root id of current instance, default = 0
|
||||
* \return prediction
|
||||
*/
|
||||
virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
* \return prediction
|
||||
*/
|
||||
virtual float Predict(const FMatrix &feats, bst_uint row_index, unsigned root_index = 0){
|
||||
utils::Error("not implemented");
|
||||
return 0.0f;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief predict values for given dense feature vector
|
||||
* \param feat feature vector in dense format
|
||||
* \param funknown indicator that the feature is missing
|
||||
* \param rid root id of current instance, default = 0
|
||||
* \return prediction
|
||||
*/
|
||||
virtual float Predict( const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned rid = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
virtual float Predict(const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned rid = 0){
|
||||
utils::Error("not implemented");
|
||||
return 0.0f;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief print information
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void PrintInfo( FILE *fo ){}
|
||||
/*!
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void PrintInfo(FILE *fo){}
|
||||
/*!
|
||||
* \brief dump model into text file
|
||||
* \param fo output stream
|
||||
* \param fo output stream
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics
|
||||
*/
|
||||
virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){
|
||||
utils::Error( "not implemented" );
|
||||
virtual void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats = false){
|
||||
utils::Error("not implemented");
|
||||
}
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~InterfaceBooster( void ){}
|
||||
virtual ~InterfaceBooster(void){}
|
||||
};
|
||||
};
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief this is the most commonly used booster interface
|
||||
/*!
|
||||
* \brief this is the most commonly used booster interface
|
||||
* we try to make booster invariant of data structures, but in most cases, FMatrixS is what we want
|
||||
*/
|
||||
typedef InterfaceBooster<FMatrixS> IBooster;
|
||||
@ -138,7 +138,7 @@ namespace xgboost{
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
/*!
|
||||
* \brief create a gradient booster, given type of booster
|
||||
* normally we use FMatrixS, by calling CreateBooster<FMatrixS>
|
||||
* \param booster_type type of gradient booster, can be used to specify implements
|
||||
@ -146,7 +146,7 @@ namespace xgboost{
|
||||
* \return the pointer to the gradient booster created
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type );
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster(int booster_type);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@ -21,76 +21,76 @@ namespace xgboost{
|
||||
typedef unsigned bst_uint;
|
||||
/*! \brief float type used in boost */
|
||||
typedef float bst_float;
|
||||
/*! \brief debug option for booster */
|
||||
const bool bst_debug = false;
|
||||
/*! \brief debug option for booster */
|
||||
const bool bst_debug = false;
|
||||
};
|
||||
};
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/**
|
||||
* \brief This is an interface, defining the way to access features,
|
||||
* \brief This is an interface, defining the way to access features,
|
||||
* by column or by row. This interface is used to make implementation
|
||||
* of booster does not depend on how feature is stored.
|
||||
*
|
||||
* Why template instead of virtual class: for efficiency
|
||||
* feature matrix is going to be used by most inner loop of the algorithm
|
||||
*
|
||||
* \tparam Derived type of actual implementation
|
||||
* \tparam Derived type of actual implementation
|
||||
* \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing
|
||||
*/
|
||||
template<typename Derived>
|
||||
struct FMatrix{
|
||||
public:
|
||||
/*! \brief example iterator over one row */
|
||||
/*! \brief example iterator over one row */
|
||||
struct RowIter{
|
||||
/*!
|
||||
* \brief move to next position
|
||||
/*!
|
||||
* \brief move to next position
|
||||
* \return whether there is element in next position
|
||||
*/
|
||||
|
||||
inline bool Next( void );
|
||||
inline bool Next(void);
|
||||
/*! \return feature index in current position */
|
||||
inline bst_uint findex( void ) const;
|
||||
inline bst_uint findex(void) const;
|
||||
/*! \return feature value in current position */
|
||||
inline bst_float fvalue( void ) const;
|
||||
inline bst_float fvalue(void) const;
|
||||
};
|
||||
/*! \brief example iterator over one column */
|
||||
struct ColIter{
|
||||
/*!
|
||||
* \brief move to next position
|
||||
/*!
|
||||
* \brief move to next position
|
||||
* \return whether there is element in next position
|
||||
*/
|
||||
inline bool Next( void );
|
||||
inline bool Next(void);
|
||||
/*! \return row index of current position */
|
||||
inline bst_uint rindex( void ) const;
|
||||
inline bst_uint rindex(void) const;
|
||||
/*! \return feature value in current position */
|
||||
inline bst_float fvalue( void ) const;
|
||||
inline bst_float fvalue(void) const;
|
||||
};
|
||||
/*! \brief backward iterator over column */
|
||||
struct ColBackIter : public ColIter {};
|
||||
public:
|
||||
/*!
|
||||
* \brief get number of rows
|
||||
/*!
|
||||
* \brief get number of rows
|
||||
* \return number of rows
|
||||
*/
|
||||
inline size_t NumRow( void ) const;
|
||||
/*!
|
||||
inline size_t NumRow(void) const;
|
||||
/*!
|
||||
* \brief get number of columns
|
||||
* \return number of columns
|
||||
*/
|
||||
inline size_t NumCol( void ) const;
|
||||
inline size_t NumCol(void) const;
|
||||
/*!
|
||||
* \brief get row iterator
|
||||
* \param ridx row index
|
||||
* \return row iterator
|
||||
*/
|
||||
inline RowIter GetRow( size_t ridx ) const;
|
||||
/*!
|
||||
inline RowIter GetRow(size_t ridx) const;
|
||||
/*!
|
||||
* \brief get number of column groups, this is used together with GetRow( ridx, gid )
|
||||
* \return number of column group
|
||||
*/
|
||||
inline unsigned NumColGroup( void ) const{
|
||||
inline unsigned NumColGroup(void) const{
|
||||
return 1;
|
||||
}
|
||||
/*!
|
||||
@ -99,32 +99,32 @@ namespace xgboost{
|
||||
* \param gid column group id
|
||||
* \return row iterator, only iterates over features of specified column group
|
||||
*/
|
||||
inline RowIter GetRow( size_t ridx, unsigned gid ) const;
|
||||
inline RowIter GetRow(size_t ridx, unsigned gid) const;
|
||||
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess( void ) const;
|
||||
inline bool HaveColAccess(void) const;
|
||||
/*!
|
||||
* \brief get column iterator, the columns must be sorted by feature value
|
||||
* \param ridx column index
|
||||
* \return column iterator
|
||||
*/
|
||||
inline ColIter GetSortedCol( size_t ridx ) const;
|
||||
inline ColIter GetSortedCol(size_t ridx) const;
|
||||
/*!
|
||||
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
|
||||
* \param ridx column index
|
||||
* \return reverse column iterator
|
||||
*/
|
||||
inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
|
||||
inline ColBackIter GetReverseSortedCol(size_t ridx) const;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
/*!
|
||||
* \brief feature matrix to store training instance, in sparse CSR format
|
||||
*/
|
||||
class FMatrixS: public FMatrix<FMatrixS>{
|
||||
*/
|
||||
class FMatrixS : public FMatrix<FMatrixS>{
|
||||
public:
|
||||
/*! \brief one entry in a row */
|
||||
struct REntry{
|
||||
@ -133,10 +133,10 @@ namespace xgboost{
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
/*! \brief constructor */
|
||||
REntry( void ){}
|
||||
REntry(void){}
|
||||
/*! \brief constructor */
|
||||
REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){}
|
||||
inline static bool cmp_fvalue( const REntry &a, const REntry &b ){
|
||||
REntry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue){}
|
||||
inline static bool cmp_fvalue(const REntry &a, const REntry &b){
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
};
|
||||
@ -147,79 +147,79 @@ namespace xgboost{
|
||||
/*! \brief size of the data */
|
||||
bst_uint len;
|
||||
/*! \brief get k-th element */
|
||||
inline const REntry& operator[]( unsigned i ) const{
|
||||
inline const REntry& operator[](unsigned i) const{
|
||||
return data_[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
/*! \brief row iterator */
|
||||
struct RowIter{
|
||||
const REntry *dptr_, *end_;
|
||||
RowIter( const REntry* dptr, const REntry* end )
|
||||
:dptr_(dptr),end_(end){}
|
||||
inline bool Next( void ){
|
||||
if( dptr_ == end_ ) return false;
|
||||
RowIter(const REntry* dptr, const REntry* end)
|
||||
:dptr_(dptr), end_(end){}
|
||||
inline bool Next(void){
|
||||
if (dptr_ == end_) return false;
|
||||
else{
|
||||
++ dptr_; return true;
|
||||
++dptr_; return true;
|
||||
}
|
||||
}
|
||||
inline bst_uint findex( void ) const{
|
||||
inline bst_uint findex(void) const{
|
||||
return dptr_->findex;
|
||||
}
|
||||
inline bst_float fvalue( void ) const{
|
||||
inline bst_float fvalue(void) const{
|
||||
return dptr_->fvalue;
|
||||
}
|
||||
};
|
||||
/*! \brief column iterator */
|
||||
struct ColIter: public RowIter{
|
||||
ColIter( const REntry* dptr, const REntry* end )
|
||||
:RowIter( dptr, end ){}
|
||||
inline bst_uint rindex( void ) const{
|
||||
struct ColIter : public RowIter{
|
||||
ColIter(const REntry* dptr, const REntry* end)
|
||||
:RowIter(dptr, end){}
|
||||
inline bst_uint rindex(void) const{
|
||||
return this->findex();
|
||||
}
|
||||
};
|
||||
/*! \brief reverse column iterator */
|
||||
struct ColBackIter: public ColIter{
|
||||
ColBackIter( const REntry* dptr, const REntry* end )
|
||||
:ColIter( dptr, end ){}
|
||||
struct ColBackIter : public ColIter{
|
||||
ColBackIter(const REntry* dptr, const REntry* end)
|
||||
:ColIter(dptr, end){}
|
||||
// shadows RowIter::Next
|
||||
inline bool Next( void ){
|
||||
if( dptr_ == end_ ) return false;
|
||||
inline bool Next(void){
|
||||
if (dptr_ == end_) return false;
|
||||
else{
|
||||
-- dptr_; return true;
|
||||
--dptr_; return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
FMatrixS( void ){ this->Clear(); }
|
||||
FMatrixS(void){ this->Clear(); }
|
||||
/*! \brief get number of rows */
|
||||
inline size_t NumRow( void ) const{
|
||||
inline size_t NumRow(void) const{
|
||||
return row_ptr_.size() - 1;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief get number of nonzero entries
|
||||
* \return number of nonzero entries
|
||||
*/
|
||||
inline size_t NumEntry( void ) const{
|
||||
inline size_t NumEntry(void) const{
|
||||
return row_data_.size();
|
||||
}
|
||||
/*! \brief clear the storage */
|
||||
inline void Clear( void ){
|
||||
inline void Clear(void){
|
||||
row_ptr_.clear();
|
||||
row_ptr_.push_back( 0 );
|
||||
row_ptr_.push_back(0);
|
||||
row_data_.clear();
|
||||
col_ptr_.clear();
|
||||
col_data_.clear();
|
||||
}
|
||||
/*! \brief get sparse part of current row */
|
||||
inline Line operator[]( size_t sidx ) const{
|
||||
inline Line operator[](size_t sidx) const{
|
||||
Line sp;
|
||||
utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" );
|
||||
sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] );
|
||||
sp.data_ = &row_data_[ row_ptr_[ sidx ] ];
|
||||
utils::Assert(!bst_debug || sidx < this->NumRow(), "row id exceed bound");
|
||||
sp.len = static_cast<bst_uint>(row_ptr_[sidx + 1] - row_ptr_[sidx]);
|
||||
sp.data_ = &row_data_[row_ptr_[sidx]];
|
||||
return sp;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief add a row to the matrix, with data stored in STL container
|
||||
* \param findex feature index
|
||||
* \param fvalue feature value
|
||||
@ -227,155 +227,155 @@ namespace xgboost{
|
||||
* \param fend end bound range of feature
|
||||
* \return the row id added line
|
||||
*/
|
||||
inline size_t AddRow( const std::vector<bst_uint> &findex,
|
||||
const std::vector<bst_float> &fvalue,
|
||||
unsigned fstart = 0, unsigned fend = UINT_MAX ){
|
||||
utils::Assert( findex.size() == fvalue.size() );
|
||||
inline size_t AddRow(const std::vector<bst_uint> &findex,
|
||||
const std::vector<bst_float> &fvalue,
|
||||
unsigned fstart = 0, unsigned fend = UINT_MAX){
|
||||
utils::Assert(findex.size() == fvalue.size());
|
||||
unsigned cnt = 0;
|
||||
for( size_t i = 0; i < findex.size(); i ++ ){
|
||||
if( findex[i] < fstart || findex[i] >= fend ) continue;
|
||||
row_data_.push_back( REntry( findex[i], fvalue[i] ) );
|
||||
cnt ++;
|
||||
for (size_t i = 0; i < findex.size(); i++){
|
||||
if (findex[i] < fstart || findex[i] >= fend) continue;
|
||||
row_data_.push_back(REntry(findex[i], fvalue[i]));
|
||||
cnt++;
|
||||
}
|
||||
row_ptr_.push_back( row_ptr_.back() + cnt );
|
||||
row_ptr_.push_back(row_ptr_.back() + cnt);
|
||||
return row_ptr_.size() - 2;
|
||||
}
|
||||
/*! \brief get row iterator*/
|
||||
inline RowIter GetRow( size_t ridx ) const{
|
||||
utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
|
||||
return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 );
|
||||
inline RowIter GetRow(size_t ridx) const{
|
||||
utils::Assert(!bst_debug || ridx < this->NumRow(), "row id exceed bound");
|
||||
return RowIter(&row_data_[row_ptr_[ridx]] - 1, &row_data_[row_ptr_[ridx + 1]] - 1);
|
||||
}
|
||||
/*! \brief get row iterator*/
|
||||
inline RowIter GetRow( size_t ridx, unsigned gid ) const{
|
||||
utils::Assert( gid == 0, "FMatrixS only have 1 column group" );
|
||||
return FMatrixS::GetRow( ridx );
|
||||
inline RowIter GetRow(size_t ridx, unsigned gid) const{
|
||||
utils::Assert(gid == 0, "FMatrixS only have 1 column group");
|
||||
return FMatrixS::GetRow(ridx);
|
||||
}
|
||||
public:
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess( void ) const{
|
||||
inline bool HaveColAccess(void) const{
|
||||
return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
|
||||
}
|
||||
/*! \brief get number of columns */
|
||||
inline size_t NumCol( void ) const{
|
||||
utils::Assert( this->HaveColAccess() );
|
||||
inline size_t NumCol(void) const{
|
||||
utils::Assert(this->HaveColAccess());
|
||||
return col_ptr_.size() - 1;
|
||||
}
|
||||
/*! \brief get col iterator*/
|
||||
inline ColIter GetSortedCol( size_t cidx ) const{
|
||||
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
|
||||
return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 );
|
||||
inline ColIter GetSortedCol(size_t cidx) const{
|
||||
utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
|
||||
return ColIter(&col_data_[col_ptr_[cidx]] - 1, &col_data_[col_ptr_[cidx + 1]] - 1);
|
||||
}
|
||||
/*! \brief get col iterator */
|
||||
inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
|
||||
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
|
||||
return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] );
|
||||
inline ColBackIter GetReverseSortedCol(size_t cidx) const{
|
||||
utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
|
||||
return ColBackIter(&col_data_[col_ptr_[cidx + 1]], &col_data_[col_ptr_[cidx]]);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the data so that we have both column and row major
|
||||
* access, call this whenever we need column access
|
||||
*/
|
||||
inline void InitData( void ){
|
||||
utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ );
|
||||
builder.InitBudget( 0 );
|
||||
for( size_t i = 0; i < this->NumRow(); i ++ ){
|
||||
for( RowIter it = this->GetRow(i); it.Next(); ){
|
||||
builder.AddBudget( it.findex() );
|
||||
inline void InitData(void){
|
||||
utils::SparseCSRMBuilder<REntry> builder(col_ptr_, col_data_);
|
||||
builder.InitBudget(0);
|
||||
for (size_t i = 0; i < this->NumRow(); i++){
|
||||
for (RowIter it = this->GetRow(i); it.Next();){
|
||||
builder.AddBudget(it.findex());
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
for( size_t i = 0; i < this->NumRow(); i ++ ){
|
||||
for( RowIter it = this->GetRow(i); it.Next(); ){
|
||||
builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) );
|
||||
for (size_t i = 0; i < this->NumRow(); i++){
|
||||
for (RowIter it = this->GetRow(i); it.Next();){
|
||||
builder.PushElem(it.findex(), REntry((bst_uint)i, it.fvalue()));
|
||||
}
|
||||
}
|
||||
// sort columns
|
||||
unsigned ncol = static_cast<unsigned>( this->NumCol() );
|
||||
for( unsigned i = 0; i < ncol; i ++ ){
|
||||
std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue );
|
||||
unsigned ncol = static_cast<unsigned>(this->NumCol());
|
||||
for (unsigned i = 0; i < ncol; i++){
|
||||
std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief save data to binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* \brief save data to binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* the function is not consistent between 64bit and 32bit machine
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveBinary( utils::IStream &fo ) const{
|
||||
FMatrixS::SaveBinary( fo, row_ptr_, row_data_ );
|
||||
inline void SaveBinary(utils::IStream &fo) const{
|
||||
FMatrixS::SaveBinary(fo, row_ptr_, row_data_);
|
||||
int col_access = this->HaveColAccess() ? 1 : 0;
|
||||
fo.Write( &col_access, sizeof(int) );
|
||||
if( col_access != 0 ){
|
||||
FMatrixS::SaveBinary( fo, col_ptr_, col_data_ );
|
||||
fo.Write(&col_access, sizeof(int));
|
||||
if (col_access != 0){
|
||||
FMatrixS::SaveBinary(fo, col_ptr_, col_data_);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load data from binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* \brief load data from binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* the function is not consistent between 64bit and 32bit machine
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadBinary( utils::IStream &fi ){
|
||||
FMatrixS::LoadBinary( fi, row_ptr_, row_data_ );
|
||||
int col_access;
|
||||
fi.Read( &col_access, sizeof(int) );
|
||||
if( col_access != 0 ){
|
||||
FMatrixS::LoadBinary( fi, col_ptr_, col_data_ );
|
||||
inline void LoadBinary(utils::IStream &fi){
|
||||
FMatrixS::LoadBinary(fi, row_ptr_, row_data_);
|
||||
int col_access;
|
||||
fi.Read(&col_access, sizeof(int));
|
||||
if (col_access != 0){
|
||||
FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \brief load from text file
|
||||
* \param fi input file pointer
|
||||
*/
|
||||
inline void LoadText( FILE *fi ){
|
||||
*/
|
||||
inline void LoadText(FILE *fi){
|
||||
this->Clear();
|
||||
int ninst;
|
||||
while( fscanf( fi, "%d", &ninst ) == 1 ){
|
||||
while (fscanf(fi, "%d", &ninst) == 1){
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
while( ninst -- ){
|
||||
while (ninst--){
|
||||
unsigned index; float value;
|
||||
utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" );
|
||||
findex.push_back( index ); fvalue.push_back( value );
|
||||
utils::Assert(fscanf(fi, "%u:%f", &index, &value) == 2, "load Text");
|
||||
findex.push_back(index); fvalue.push_back(value);
|
||||
}
|
||||
this->AddRow( findex, fvalue );
|
||||
this->AddRow(findex, fvalue);
|
||||
}
|
||||
// initialize column support as well
|
||||
this->InitData();
|
||||
}
|
||||
private:
|
||||
/*!
|
||||
* \brief save data to binary stream
|
||||
* \brief save data to binary stream
|
||||
* \param fo output stream
|
||||
* \param ptr pointer data
|
||||
* \param data data content
|
||||
*/
|
||||
inline static void SaveBinary( utils::IStream &fo,
|
||||
const std::vector<size_t> &ptr,
|
||||
const std::vector<REntry> &data ){
|
||||
inline static void SaveBinary(utils::IStream &fo,
|
||||
const std::vector<size_t> &ptr,
|
||||
const std::vector<REntry> &data){
|
||||
size_t nrow = ptr.size() - 1;
|
||||
fo.Write( &nrow, sizeof(size_t) );
|
||||
fo.Write( &ptr[0], ptr.size() * sizeof(size_t) );
|
||||
if( data.size() != 0 ){
|
||||
fo.Write( &data[0] , data.size() * sizeof(REntry) );
|
||||
fo.Write(&nrow, sizeof(size_t));
|
||||
fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
|
||||
if (data.size() != 0){
|
||||
fo.Write(&data[0], data.size() * sizeof(REntry));
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load data from binary stream
|
||||
* \brief load data from binary stream
|
||||
* \param fi input stream
|
||||
* \param ptr pointer data
|
||||
* \param data data content
|
||||
*/
|
||||
inline static void LoadBinary( utils::IStream &fi,
|
||||
std::vector<size_t> &ptr,
|
||||
std::vector<REntry> &data ){
|
||||
inline static void LoadBinary(utils::IStream &fi,
|
||||
std::vector<size_t> &ptr,
|
||||
std::vector<REntry> &data){
|
||||
size_t nrow;
|
||||
utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" );
|
||||
ptr.resize( nrow + 1 );
|
||||
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
|
||||
utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS");
|
||||
ptr.resize(nrow + 1);
|
||||
utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)), "Load FMatrixS");
|
||||
|
||||
data.resize( ptr.back() );
|
||||
if( data.size() != 0 ){
|
||||
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
|
||||
data.resize(ptr.back());
|
||||
if (data.size() != 0){
|
||||
utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)), "Load FMatrixS");
|
||||
}
|
||||
}
|
||||
protected:
|
||||
@ -387,7 +387,7 @@ namespace xgboost{
|
||||
std::vector<size_t> col_ptr_;
|
||||
/*! \brief column data */
|
||||
std::vector<REntry> col_data_;
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -8,25 +8,25 @@
|
||||
#include "../utils/xgboost_config.h"
|
||||
/*!
|
||||
* \file xgboost_gbmbase.h
|
||||
* \brief a base model class,
|
||||
* \brief a base model class,
|
||||
* that assembles the ensembles of booster together and do model update
|
||||
* this class can be used as base code to create booster variants
|
||||
* this class can be used as base code to create booster variants
|
||||
*
|
||||
* The detailed implementation of boosters should start by using the class
|
||||
* provided by this file
|
||||
*
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief a base model class,
|
||||
* \brief a base model class,
|
||||
* that assembles the ensembles of booster together and provide single routines to do prediction buffer and update
|
||||
* this class can be used as base code to create booster variants
|
||||
* this class can be used as base code to create booster variants
|
||||
* *
|
||||
* relation to xgboost.h:
|
||||
* (1) xgboost.h provides an interface to a single booster (e.g. a single regression tree)
|
||||
* while GBMBaseModel builds upon IBooster to build a class that
|
||||
* while GBMBaseModel builds upon IBooster to build a class that
|
||||
* ensembles the boosters together;
|
||||
* (2) GBMBaseModel provides prediction buffering scheme to speedup training;
|
||||
* (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles;
|
||||
@ -37,259 +37,260 @@ namespace xgboost{
|
||||
* (3) model.InitTrainer before calling model.Predict and model.DoBoost
|
||||
* (4) model.Predict to get predictions given a instance
|
||||
* (4) model.DoBoost to update the ensembles, add new booster to the model
|
||||
* (4) model.SaveModel to save learned results
|
||||
* (4) model.SaveModel to save learned results
|
||||
*
|
||||
* Buffering: each instance comes with a buffer_index in Predict.
|
||||
* when mparam.num_pbuffer != 0, a unique buffer index can be
|
||||
* Buffering: each instance comes with a buffer_index in Predict.
|
||||
* when mparam.num_pbuffer != 0, a unique buffer index can be
|
||||
* assigned to each instance to buffer previous results of boosters,
|
||||
* this helps to speedup training, so consider assign buffer_index
|
||||
* this helps to speedup training, so consider assign buffer_index
|
||||
* for each training instance, if buffer_index = -1, the code
|
||||
* recalculates things from scratch and will still work correctly
|
||||
*/
|
||||
class GBMBase{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
GBMBase( void ){}
|
||||
GBMBase(void){}
|
||||
/*! \brief destructor */
|
||||
virtual ~GBMBase( void ){
|
||||
virtual ~GBMBase(void){
|
||||
this->FreeSpace();
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strncmp( name, "bst:", 4 ) ){
|
||||
cfg.PushBack( name + 4, val );
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strncmp(name, "bst:", 4)){
|
||||
cfg.PushBack(name + 4, val);
|
||||
}
|
||||
if( !strcmp( name, "silent") ){
|
||||
cfg.PushBack( name, val );
|
||||
if (!strcmp(name, "silent")){
|
||||
cfg.PushBack(name, val);
|
||||
}
|
||||
tparam.SetParam( name, val );
|
||||
if( boosters.size() == 0 ) mparam.SetParam( name, val );
|
||||
tparam.SetParam(name, val);
|
||||
if (boosters.size() == 0) mparam.SetParam(name, val);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
if( boosters.size() != 0 ) this->FreeSpace();
|
||||
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
|
||||
boosters.resize( mparam.num_boosters );
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
boosters[ i ] = booster::CreateBooster<FMatrixS>( mparam.booster_type );
|
||||
boosters[ i ]->LoadModel( fi );
|
||||
inline void LoadModel(utils::IStream &fi){
|
||||
if (boosters.size() != 0) this->FreeSpace();
|
||||
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
|
||||
boosters.resize(mparam.num_boosters);
|
||||
for (size_t i = 0; i < boosters.size(); i++){
|
||||
boosters[i] = booster::CreateBooster<FMatrixS>(mparam.booster_type);
|
||||
boosters[i]->LoadModel(fi);
|
||||
}
|
||||
{// load info
|
||||
booster_info.resize( mparam.num_boosters );
|
||||
if( mparam.num_boosters != 0 ){
|
||||
utils::Assert( fi.Read( &booster_info[0], sizeof(int)*mparam.num_boosters ) != 0 );
|
||||
booster_info.resize(mparam.num_boosters);
|
||||
if (mparam.num_boosters != 0){
|
||||
utils::Assert(fi.Read(&booster_info[0], sizeof(int)*mparam.num_boosters) != 0);
|
||||
}
|
||||
}
|
||||
if( mparam.num_pbuffer != 0 ){
|
||||
pred_buffer.resize ( mparam.num_pbuffer );
|
||||
pred_counter.resize( mparam.num_pbuffer );
|
||||
utils::Assert( fi.Read( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ) != 0 );
|
||||
utils::Assert( fi.Read( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ) != 0 );
|
||||
if (mparam.num_pbuffer != 0){
|
||||
pred_buffer.resize(mparam.num_pbuffer);
|
||||
pred_counter.resize(mparam.num_pbuffer);
|
||||
utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
|
||||
utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel( utils::IStream &fo ) const {
|
||||
utils::Assert( mparam.num_boosters == (int)boosters.size() );
|
||||
fo.Write( &mparam, sizeof(ModelParam) );
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
boosters[ i ]->SaveModel( fo );
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
utils::Assert(mparam.num_boosters == (int)boosters.size());
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
for (size_t i = 0; i < boosters.size(); i++){
|
||||
boosters[i]->SaveModel(fo);
|
||||
}
|
||||
if( booster_info.size() != 0 ){
|
||||
fo.Write( &booster_info[0], sizeof(int) * booster_info.size() );
|
||||
if (booster_info.size() != 0){
|
||||
fo.Write(&booster_info[0], sizeof(int)* booster_info.size());
|
||||
}
|
||||
if( mparam.num_pbuffer != 0 ){
|
||||
fo.Write( &pred_buffer[0] , pred_buffer.size()*sizeof(float) );
|
||||
fo.Write( &pred_counter[0], pred_counter.size()*sizeof(unsigned) );
|
||||
if (mparam.num_pbuffer != 0){
|
||||
fo.Write(&pred_buffer[0], pred_buffer.size()*sizeof(float));
|
||||
fo.Write(&pred_counter[0], pred_counter.size()*sizeof(unsigned));
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel( void ){
|
||||
inline void InitModel(void){
|
||||
pred_buffer.clear(); pred_counter.clear();
|
||||
pred_buffer.resize ( mparam.num_pbuffer, 0.0 );
|
||||
pred_counter.resize( mparam.num_pbuffer, 0 );
|
||||
utils::Assert( mparam.num_boosters == 0 );
|
||||
utils::Assert( boosters.size() == 0 );
|
||||
pred_buffer.resize(mparam.num_pbuffer, 0.0);
|
||||
pred_counter.resize(mparam.num_pbuffer, 0);
|
||||
utils::Assert(mparam.num_boosters == 0);
|
||||
utils::Assert(boosters.size() == 0);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer( void ){
|
||||
if( tparam.nthread != 0 ){
|
||||
omp_set_num_threads( tparam.nthread );
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer(void){
|
||||
if (tparam.nthread != 0){
|
||||
omp_set_num_threads(tparam.nthread);
|
||||
}
|
||||
// make sure all the boosters get the latest parameters
|
||||
for( size_t i = 0; i < this->boosters.size(); i ++ ){
|
||||
this->ConfigBooster( this->boosters[i] );
|
||||
for (size_t i = 0; i < this->boosters.size(); i++){
|
||||
this->ConfigBooster(this->boosters[i]);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics
|
||||
*/
|
||||
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
fprintf( fo, "booster[%d]\n", (int)i );
|
||||
boosters[i]->DumpModel( fo, fmap, with_stats );
|
||||
*/
|
||||
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){
|
||||
for (size_t i = 0; i < boosters.size(); i++){
|
||||
fprintf(fo, "booster[%d]\n", (int)i);
|
||||
boosters[i]->DumpModel(fo, fmap, with_stats);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath( FILE *fo, const FMatrixS &data ){
|
||||
for( size_t i = 0; i < data.NumRow(); ++ i ){
|
||||
for( size_t j = 0; j < boosters.size(); ++ j ){
|
||||
if( j != 0 ) fprintf( fo, "\t" );
|
||||
inline void DumpPath(FILE *fo, const FMatrixS &data){
|
||||
for (size_t i = 0; i < data.NumRow(); ++i){
|
||||
for (size_t j = 0; j < boosters.size(); ++j){
|
||||
if (j != 0) fprintf(fo, "\t");
|
||||
std::vector<int> path;
|
||||
boosters[j]->PredPath( path, data, i );
|
||||
fprintf( fo, "%d", path[0] );
|
||||
for( size_t k = 1; k < path.size(); ++ k ){
|
||||
fprintf( fo, ",%d", path[k] );
|
||||
boosters[j]->PredPath(path, data, i);
|
||||
fprintf(fo, "%d", path[0]);
|
||||
for (size_t k = 1; k < path.size(); ++k){
|
||||
fprintf(fo, ",%d", path[k]);
|
||||
}
|
||||
}
|
||||
fprintf( fo, "\n" );
|
||||
fprintf(fo, "\n");
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
/*!
|
||||
* \brief do gradient boost training for one step, using the information given
|
||||
* Note: content of grad and hess can change after DoBoost
|
||||
* \param grad first order gradient of each instance
|
||||
* \param hess second order gradient of each instance
|
||||
* \param feats features of each instance
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
inline void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const booster::FMatrixS &feats,
|
||||
const std::vector<unsigned> &root_index ) {
|
||||
inline void DoBoost(std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const booster::FMatrixS &feats,
|
||||
const std::vector<unsigned> &root_index) {
|
||||
booster::IBooster *bst = this->GetUpdateBooster();
|
||||
bst->DoBoost( grad, hess, feats, root_index );
|
||||
bst->DoBoost(grad, hess, feats, root_index);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief predict values for given sparse feature vector
|
||||
* NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
|
||||
* \param root_index root id of current instance, default = 0
|
||||
* \return prediction
|
||||
* \return prediction
|
||||
*/
|
||||
inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
inline float Predict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
|
||||
size_t istart = 0;
|
||||
float psum = 0.0f;
|
||||
|
||||
// load buffered results if any
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" );
|
||||
istart = this->pred_counter[ buffer_index ];
|
||||
psum = this->pred_buffer [ buffer_index ];
|
||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||
utils::Assert(buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer");
|
||||
istart = this->pred_counter[buffer_index];
|
||||
psum = this->pred_buffer[buffer_index];
|
||||
}
|
||||
|
||||
for (size_t i = istart; i < this->boosters.size(); i++){
|
||||
psum += this->boosters[i]->Predict(feats, row_index, root_index);
|
||||
}
|
||||
|
||||
for( size_t i = istart; i < this->boosters.size(); i ++ ){
|
||||
psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
|
||||
}
|
||||
// updated the buffered results
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_counter[ buffer_index ] = static_cast<unsigned>( boosters.size() );
|
||||
this->pred_buffer [ buffer_index ] = psum;
|
||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||
this->pred_counter[buffer_index] = static_cast<unsigned>(boosters.size());
|
||||
this->pred_buffer[buffer_index] = psum;
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
public:
|
||||
//--------trial code for interactive update an existing booster------
|
||||
//-------- usually not needed, ignore this region ---------
|
||||
/*!
|
||||
* \brief same as Predict, but removes the prediction of booster to be updated
|
||||
/*!
|
||||
* \brief same as Predict, but removes the prediction of booster to be updated
|
||||
* this function must be called once and only once for every data with pbuffer
|
||||
*/
|
||||
inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
float psum = this->Predict( feats, row_index, buffer_index, root_index );
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
inline float InteractPredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
|
||||
float psum = this->Predict(feats, row_index, buffer_index, root_index);
|
||||
if (tparam.reupdate_booster != -1){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
psum -= boosters[ bid ]->Predict( feats, row_index, root_index );
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_buffer[ buffer_index ] = psum;
|
||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||
psum -= boosters[bid]->Predict(feats, row_index, root_index);
|
||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||
this->pred_buffer[buffer_index] = psum;
|
||||
}
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
/*! \brief delete the specified booster */
|
||||
inline void DelteBooster( void ){
|
||||
inline void DelteBooster(void){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < mparam.num_boosters , "must specify booster index for deletion");
|
||||
delete boosters[ bid ];
|
||||
for( int i = bid + 1; i < mparam.num_boosters; ++ i ){
|
||||
boosters[i-1] = boosters[ i ];
|
||||
booster_info[i-1] = booster_info[ i ];
|
||||
}
|
||||
boosters.resize( mparam.num_boosters -= 1 );
|
||||
booster_info.resize( boosters.size() );
|
||||
utils::Assert(bid >= 0 && bid < mparam.num_boosters, "must specify booster index for deletion");
|
||||
delete boosters[bid];
|
||||
for (int i = bid + 1; i < mparam.num_boosters; ++i){
|
||||
boosters[i - 1] = boosters[i];
|
||||
booster_info[i - 1] = booster_info[i];
|
||||
}
|
||||
boosters.resize(mparam.num_boosters -= 1);
|
||||
booster_info.resize(boosters.size());
|
||||
}
|
||||
/*! \brief update the prediction buffer, after booster have been updated */
|
||||
inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
/*! \brief update the prediction buffer, after booster have been updated */
|
||||
inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0){
|
||||
if (tparam.reupdate_booster != -1){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index );
|
||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||
if (mparam.do_reboost == 0 && buffer_index >= 0){
|
||||
this->pred_buffer[buffer_index] += boosters[bid]->Predict(feats, row_index, root_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
//-----------non public fields afterwards-------------
|
||||
protected:
|
||||
/*! \brief free space of the model */
|
||||
inline void FreeSpace( void ){
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
inline void FreeSpace(void){
|
||||
for (size_t i = 0; i < boosters.size(); i++){
|
||||
delete boosters[i];
|
||||
}
|
||||
boosters.clear(); booster_info.clear(); mparam.num_boosters = 0;
|
||||
boosters.clear(); booster_info.clear(); mparam.num_boosters = 0;
|
||||
}
|
||||
/*! \brief configure a booster */
|
||||
inline void ConfigBooster( booster::IBooster *bst ){
|
||||
inline void ConfigBooster(booster::IBooster *bst){
|
||||
cfg.BeforeFirst();
|
||||
while( cfg.Next() ){
|
||||
bst->SetParam( cfg.name(), cfg.val() );
|
||||
while (cfg.Next()){
|
||||
bst->SetParam(cfg.name(), cfg.val());
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief get a booster to update
|
||||
/*!
|
||||
* \brief get a booster to update
|
||||
* \return the booster created
|
||||
*/
|
||||
inline booster::IBooster *GetUpdateBooster( void ){
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
inline booster::IBooster *GetUpdateBooster(void){
|
||||
if (tparam.reupdate_booster != -1){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
this->ConfigBooster( boosters[bid] );
|
||||
return boosters[ bid ];
|
||||
utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
|
||||
this->ConfigBooster(boosters[bid]);
|
||||
return boosters[bid];
|
||||
}
|
||||
|
||||
if( mparam.do_reboost == 0 || boosters.size() == 0 ){
|
||||
if (mparam.do_reboost == 0 || boosters.size() == 0){
|
||||
mparam.num_boosters += 1;
|
||||
boosters.push_back( booster::CreateBooster<FMatrixS>( mparam.booster_type ) );
|
||||
booster_info.push_back( 0 );
|
||||
this->ConfigBooster( boosters.back() );
|
||||
boosters.back()->InitModel();
|
||||
}else{
|
||||
this->ConfigBooster( boosters.back() );
|
||||
boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
|
||||
booster_info.push_back(0);
|
||||
this->ConfigBooster(boosters.back());
|
||||
boosters.back()->InitModel();
|
||||
}
|
||||
else{
|
||||
this->ConfigBooster(boosters.back());
|
||||
}
|
||||
return boosters.back();
|
||||
}
|
||||
@ -306,76 +307,76 @@ namespace xgboost{
|
||||
int num_feature;
|
||||
/*! \brief size of prediction buffer allocated for buffering boosting computation */
|
||||
int num_pbuffer;
|
||||
/*!
|
||||
/*!
|
||||
* \brief whether we repeatedly update a single booster each round: default 0
|
||||
* set to 1 for linear booster, so that regularization term can be considered
|
||||
*/
|
||||
int do_reboost;
|
||||
/*! \brief reserved parameters */
|
||||
int reserved[ 32 ];
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
ModelParam( void ){
|
||||
num_boosters = 0;
|
||||
ModelParam(void){
|
||||
num_boosters = 0;
|
||||
booster_type = 0;
|
||||
num_roots = num_feature = 0;
|
||||
num_roots = num_feature = 0;
|
||||
do_reboost = 0;
|
||||
num_pbuffer = 0;
|
||||
memset( reserved, 0, sizeof( reserved ) );
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("booster_type", name ) ){
|
||||
booster_type = atoi( val );
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("booster_type", name)){
|
||||
booster_type = atoi(val);
|
||||
// linear boost automatically set do reboost
|
||||
if( booster_type == 1 ) do_reboost = 1;
|
||||
if (booster_type == 1) do_reboost = 1;
|
||||
}
|
||||
if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val );
|
||||
if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val );
|
||||
if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val );
|
||||
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
|
||||
if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
|
||||
if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
|
||||
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
};
|
||||
/*! \brief training parameters */
|
||||
struct TrainParam{
|
||||
/*! \brief number of OpenMP threads */
|
||||
int nthread;
|
||||
/*!
|
||||
* \brief index of specific booster to be re-updated, default = -1: update new booster
|
||||
/*!
|
||||
* \brief index of specific booster to be re-updated, default = -1: update new booster
|
||||
* parameter this is part of trial interactive update mode
|
||||
*/
|
||||
int reupdate_booster;
|
||||
/*! \brief constructor */
|
||||
TrainParam( void ) {
|
||||
TrainParam(void) {
|
||||
nthread = 1;
|
||||
reupdate_booster = -1;
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("nthread", name ) ) nthread = atoi( val );
|
||||
if( !strcmp("interact:booster_index", name ) ) reupdate_booster = atoi( val );
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("nthread", name)) nthread = atoi(val);
|
||||
if (!strcmp("interact:booster_index", name)) reupdate_booster = atoi(val);
|
||||
}
|
||||
};
|
||||
protected:
|
||||
/*! \brief model parameters */
|
||||
/*! \brief model parameters */
|
||||
ModelParam mparam;
|
||||
/*! \brief training parameters */
|
||||
/*! \brief training parameters */
|
||||
TrainParam tparam;
|
||||
protected:
|
||||
/*! \brief component boosters */
|
||||
/*! \brief component boosters */
|
||||
std::vector<booster::IBooster*> boosters;
|
||||
/*! \brief some information indicator of the booster, reserved */
|
||||
/*! \brief some information indicator of the booster, reserved */
|
||||
std::vector<int> booster_info;
|
||||
/*! \brief prediction buffer */
|
||||
/*! \brief prediction buffer */
|
||||
std::vector<float> pred_buffer;
|
||||
/*! \brief prediction buffer counter, record the progress so far of the buffer */
|
||||
/*! \brief prediction buffer counter, record the progress so far of the buffer */
|
||||
std::vector<unsigned> pred_counter;
|
||||
/*! \brief configurations saved for each booster */
|
||||
utils::ConfigSaver cfg;
|
||||
|
||||
13
demo/rank/README
Normal file
13
demo/rank/README
Normal file
@ -0,0 +1,13 @@
|
||||
Demonstrating how to use XGBoost to accomplish regression tasks on the computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
|
||||
|
||||
Run: ./runexp.sh
|
||||
|
||||
Format of input: LIBSVM format
|
||||
|
||||
Format of `featmap.txt`: `<featureid> <featurename> <q or i or int>\n`:
|
||||
- Feature id must be from 0 to number of features, in sorted order.
|
||||
- i means this feature is binary indicator feature
|
||||
- q means this feature is a quantitative value, such as age, time, can be missing
|
||||
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
|
||||
|
||||
Explanations: https://github.com/tqchen/xgboost/wiki/Regression
|
||||
16
demo/rank/runexp.sh
Normal file
16
demo/rank/runexp.sh
Normal file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
# map the data to features. For convenience we only use 7 original attributes and encode them as features in a trivial way
|
||||
python mapfeat.py
|
||||
# split train and test
|
||||
python mknfold.py machine.txt 1
|
||||
# training and output the models
|
||||
../../xgboost machine.conf
|
||||
# output predictions of test data
|
||||
../../xgboost machine.conf task=pred model_in=0002.model
|
||||
# print the boosters of 0002.model in dump.raw.txt
|
||||
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
# print the boosters of 0002.model in dump.nice.txt with feature map
|
||||
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
# cat the result
|
||||
cat dump.nice.txt
|
||||
5
demo/rank/toy.eval
Normal file
5
demo/rank/toy.eval
Normal file
@ -0,0 +1,5 @@
|
||||
1 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
1 0:2 1:3 2:2
|
||||
2
demo/rank/toy.eval.group
Normal file
2
demo/rank/toy.eval.group
Normal file
@ -0,0 +1,2 @@
|
||||
2
|
||||
3
|
||||
5
demo/rank/toy.test
Normal file
5
demo/rank/toy.test
Normal file
@ -0,0 +1,5 @@
|
||||
1 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
0 0:2 1:3 2:2
|
||||
1 0:2 1:3 2:2
|
||||
2
demo/rank/toy.test.group
Normal file
2
demo/rank/toy.test.group
Normal file
@ -0,0 +1,2 @@
|
||||
2
|
||||
3
|
||||
11
demo/rank/toy.train
Normal file
11
demo/rank/toy.train
Normal file
@ -0,0 +1,11 @@
|
||||
1 0:1.2 1:3 2:5.6
|
||||
0 0:2.0 1:2.3 2:5.1
|
||||
0 0:3.9 1:3 2:3.1
|
||||
0 0:2 1:3.2 2:3.4
|
||||
1 0:2.1 1:4.5 2:4.2
|
||||
0 0:1.9 1:2.8 2:3.1
|
||||
1 0:3.0 1:2.0 2:1.1
|
||||
0 0:1.9 1:1.8 2:2.1
|
||||
0 0:1.1 1:2.2 2:1.4
|
||||
1 0:2.1 1:4.1 2:4.0
|
||||
0 0:1.9 1:2.2 2:1.1
|
||||
2
demo/rank/toy.train.group
Normal file
2
demo/rank/toy.train.group
Normal file
@ -0,0 +1,2 @@
|
||||
6
|
||||
5
|
||||
0
demo/rank/train
Normal file
0
demo/rank/train
Normal file
@ -11,314 +11,319 @@
|
||||
#include "../utils/xgboost_config.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace base{
|
||||
/*!
|
||||
* \brief wrapping the training process of the gradient boosting model,
|
||||
* given the configuration
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
|
||||
*/
|
||||
class BoostTask{
|
||||
public:
|
||||
inline int Run(int argc, char *argv[]){
|
||||
if (argc < 2){
|
||||
printf("Usage: <config>\n");
|
||||
return 0;
|
||||
}
|
||||
utils::ConfigIterator itr(argv[1]);
|
||||
while (itr.Next()){
|
||||
this->SetParam(itr.name(), itr.val());
|
||||
}
|
||||
for (int i = 2; i < argc; i++){
|
||||
char name[256], val[256];
|
||||
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){
|
||||
this->SetParam(name, val);
|
||||
}
|
||||
}
|
||||
this->InitData();
|
||||
this->InitLearner();
|
||||
if (task == "dump"){
|
||||
this->TaskDump();
|
||||
return 0;
|
||||
}
|
||||
if (task == "interact"){
|
||||
this->TaskInteractive(); return 0;
|
||||
}
|
||||
if (task == "dumppath"){
|
||||
this->TaskDumpPath(); return 0;
|
||||
}
|
||||
if (task == "eval"){
|
||||
this->TaskEval(); return 0;
|
||||
}
|
||||
if (task == "pred"){
|
||||
this->TaskPred();
|
||||
}
|
||||
else{
|
||||
this->TaskTrain();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
namespace base{
|
||||
/*!
|
||||
* \brief wrapping the training process of the gradient boosting model,
|
||||
* given the configuration
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
|
||||
*/
|
||||
class BoostTask{
|
||||
public:
|
||||
inline int Run(int argc, char *argv[]){
|
||||
|
||||
enum learning_tasks{
|
||||
REGRESSION = 0,
|
||||
BINARY_CLASSIFICATION = 1,
|
||||
RANKING = 2
|
||||
};
|
||||
if (argc < 2){
|
||||
printf("Usage: <config>\n");
|
||||
return 0;
|
||||
}
|
||||
utils::ConfigIterator itr(argv[1]);
|
||||
while (itr.Next()){
|
||||
this->SetParam(itr.name(), itr.val());
|
||||
}
|
||||
for (int i = 2; i < argc; i++){
|
||||
char name[256], val[256];
|
||||
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){
|
||||
this->SetParam(name, val);
|
||||
}
|
||||
}
|
||||
|
||||
this->InitData();
|
||||
this->InitLearner();
|
||||
if (task == "dump"){
|
||||
this->TaskDump();
|
||||
return 0;
|
||||
}
|
||||
if (task == "interact"){
|
||||
this->TaskInteractive(); return 0;
|
||||
}
|
||||
if (task == "dumppath"){
|
||||
this->TaskDumpPath(); return 0;
|
||||
}
|
||||
if (task == "eval"){
|
||||
this->TaskEval(); return 0;
|
||||
}
|
||||
if (task == "pred"){
|
||||
this->TaskPred();
|
||||
}
|
||||
else{
|
||||
this->TaskTrain();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* \brief set learner
|
||||
* \param learner the passed in learner
|
||||
*/
|
||||
inline void SetLearner(BoostLearner* learner){
|
||||
learner_ = learner;
|
||||
}
|
||||
enum learning_tasks{
|
||||
REGRESSION = 0,
|
||||
BINARY_CLASSIFICATION = 1,
|
||||
RANKING = 2
|
||||
};
|
||||
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("learning_task", name)) learning_task = atoi(val);
|
||||
if (!strcmp("silent", name)) silent = atoi(val);
|
||||
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
|
||||
if (!strcmp("seed", name)) random::Seed(atoi(val));
|
||||
if (!strcmp("num_round", name)) num_round = atoi(val);
|
||||
if (!strcmp("save_period", name)) save_period = atoi(val);
|
||||
if (!strcmp("task", name)) task = val;
|
||||
if (!strcmp("data", name)) train_path = val;
|
||||
if (!strcmp("test:data", name)) test_path = val;
|
||||
if (!strcmp("model_in", name)) model_in = val;
|
||||
if (!strcmp("model_out", name)) model_out = val;
|
||||
if (!strcmp("model_dir", name)) model_dir_path = val;
|
||||
if (!strcmp("fmap", name)) name_fmap = val;
|
||||
if (!strcmp("name_dump", name)) name_dump = val;
|
||||
if (!strcmp("name_dumppath", name)) name_dumppath = val;
|
||||
if (!strcmp("name_pred", name)) name_pred = val;
|
||||
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
|
||||
if (!strcmp("interact:action", name)) interact_action = val;
|
||||
if (!strncmp("batch:", name, 6)){
|
||||
cfg_batch.PushBack(name + 6, val);
|
||||
}
|
||||
if (!strncmp("eval[", name, 5)) {
|
||||
char evname[256];
|
||||
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
|
||||
eval_data_names.push_back(std::string(evname));
|
||||
eval_data_paths.push_back(std::string(val));
|
||||
}
|
||||
cfg.PushBack(name, val);
|
||||
}
|
||||
public:
|
||||
BoostTask(void){
|
||||
// default parameters
|
||||
silent = 0;
|
||||
use_buffer = 1;
|
||||
num_round = 10;
|
||||
save_period = 0;
|
||||
dump_model_stats = 0;
|
||||
task = "train";
|
||||
model_in = "NULL";
|
||||
model_out = "NULL";
|
||||
name_fmap = "NULL";
|
||||
name_pred = "pred.txt";
|
||||
name_dump = "dump.txt";
|
||||
name_dumppath = "dump.path.txt";
|
||||
model_dir_path = "./";
|
||||
interact_action = "update";
|
||||
}
|
||||
~BoostTask(void){
|
||||
for (size_t i = 0; i < deval.size(); i++){
|
||||
delete deval[i];
|
||||
}
|
||||
}
|
||||
private:
|
||||
/* \brief set learner
|
||||
* \param learner the passed in learner
|
||||
*/
|
||||
inline void SetLearner(BoostLearner* learner){
|
||||
learner_ = learner;
|
||||
}
|
||||
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("learning_task", name)) learning_task = atoi(val);
|
||||
if (!strcmp("silent", name)) silent = atoi(val);
|
||||
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
|
||||
if (!strcmp("seed", name)) random::Seed(atoi(val));
|
||||
if (!strcmp("num_round", name)) num_round = atoi(val);
|
||||
if (!strcmp("save_period", name)) save_period = atoi(val);
|
||||
if (!strcmp("task", name)) task = val;
|
||||
if (!strcmp("data", name)) train_path = val;
|
||||
if (!strcmp("test:data", name)) test_path = val;
|
||||
if (!strcmp("model_in", name)) model_in = val;
|
||||
if (!strcmp("model_out", name)) model_out = val;
|
||||
if (!strcmp("model_dir", name)) model_dir_path = val;
|
||||
if (!strcmp("fmap", name)) name_fmap = val;
|
||||
if (!strcmp("name_dump", name)) name_dump = val;
|
||||
if (!strcmp("name_dumppath", name)) name_dumppath = val;
|
||||
if (!strcmp("name_pred", name)) name_pred = val;
|
||||
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
|
||||
if (!strcmp("interact:action", name)) interact_action = val;
|
||||
if (!strncmp("batch:", name, 6)){
|
||||
cfg_batch.PushBack(name + 6, val);
|
||||
}
|
||||
if (!strncmp("eval[", name, 5)) {
|
||||
char evname[256];
|
||||
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
|
||||
eval_data_names.push_back(std::string(evname));
|
||||
eval_data_paths.push_back(std::string(val));
|
||||
}
|
||||
cfg.PushBack(name, val);
|
||||
}
|
||||
public:
|
||||
BoostTask(void){
|
||||
// default parameters
|
||||
silent = 0;
|
||||
use_buffer = 1;
|
||||
num_round = 10;
|
||||
save_period = 0;
|
||||
dump_model_stats = 0;
|
||||
task = "train";
|
||||
model_in = "NULL";
|
||||
model_out = "NULL";
|
||||
name_fmap = "NULL";
|
||||
name_pred = "pred.txt";
|
||||
name_dump = "dump.txt";
|
||||
name_dumppath = "dump.path.txt";
|
||||
model_dir_path = "./";
|
||||
interact_action = "update";
|
||||
}
|
||||
~BoostTask(void){
|
||||
for (size_t i = 0; i < deval.size(); i++){
|
||||
delete deval[i];
|
||||
}
|
||||
}
|
||||
private:
|
||||
|
||||
|
||||
inline void InitData(void){
|
||||
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
|
||||
if (task == "dump") return;
|
||||
if (learning_task == RANKING){
|
||||
char instance_path[256], group_path[256];
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
sscanf(test_path.c_str(), "%[^;];%s", instance_path, group_path);
|
||||
data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
}
|
||||
else{
|
||||
// training
|
||||
sscanf(train_path.c_str(), "%[^;];%s", instance_path, group_path);
|
||||
data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
sscanf(eval_data_paths[i].c_str(), "%[^;];%s", instance_path, group_path);
|
||||
deval.back()->CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
inline void InitData(void){
|
||||
|
||||
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
|
||||
if (task == "dump") return;
|
||||
if (learning_task == RANKING){
|
||||
char instance_path[256], group_path[256];
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
sscanf(test_path.c_str(), "%[^;];%s", instance_path, group_path);
|
||||
data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
}
|
||||
else{
|
||||
// training
|
||||
sscanf(train_path.c_str(), "%[^;];%s", instance_path, group_path);
|
||||
data.CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
sscanf(eval_data_paths[i].c_str(), "%[^;];%s", instance_path, group_path);
|
||||
deval.back()->CacheLoad(instance_path, group_path, silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else{
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
data.CacheLoad(test_path.c_str(), "", silent != 0, use_buffer != 0);
|
||||
}
|
||||
else{
|
||||
// training
|
||||
data.CacheLoad(train_path.c_str(), "", silent != 0, use_buffer != 0);
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
deval.back()->CacheLoad(eval_data_paths[i].c_str(), "", silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
learner_->SetData(&data, deval, eval_data_names);
|
||||
if(!silent) printf("BoostTask:Data Initiation Done!\n");
|
||||
}
|
||||
|
||||
inline void InitLearner(void){
|
||||
cfg.BeforeFirst();
|
||||
while (cfg.Next()){
|
||||
learner_->SetParam(cfg.name(), cfg.val());
|
||||
}
|
||||
if (model_in != "NULL"){
|
||||
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
|
||||
learner_->LoadModel(fi);
|
||||
fi.Close();
|
||||
}
|
||||
else{
|
||||
utils::Assert(task == "train", "model_in not specified");
|
||||
learner_->InitModel();
|
||||
}
|
||||
learner_->InitTrainer();
|
||||
if(!silent) printf("BoostTask:InitLearner Done!\n");
|
||||
}
|
||||
|
||||
}
|
||||
else{
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
data.CacheLoad(test_path.c_str(), "", silent != 0, use_buffer != 0);
|
||||
}
|
||||
else{
|
||||
// training
|
||||
data.CacheLoad(train_path.c_str(), "", silent != 0, use_buffer != 0);
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
deval.back()->CacheLoad(eval_data_paths[i].c_str(), "", silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
inline void TaskTrain(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
for (int i = 0; i < num_round; ++i){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||
learner_->UpdateOneIter(i);
|
||||
learner_->EvalOneIter(i);
|
||||
if (save_period != 0 && (i + 1) % save_period == 0){
|
||||
this->SaveModel(i);
|
||||
}
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
}
|
||||
// always save final round
|
||||
if (save_period == 0 || num_round % save_period != 0){
|
||||
if (model_out == "NULL"){
|
||||
this->SaveModel(num_round - 1);
|
||||
}
|
||||
else{
|
||||
this->SaveModel(model_out.c_str());
|
||||
}
|
||||
}
|
||||
if (!silent){
|
||||
printf("\nupdating end, %lu sec in all\n", elapsed);
|
||||
}
|
||||
}
|
||||
/*! \brief evaluate the current model once, passing 0 as the iteration number */
inline void TaskEval(void){
    const int eval_round = 0;
    learner_->EvalOneIter(eval_round);
}
|
||||
inline void TaskInteractive(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
int batch_action = 0;
|
||||
|
||||
learner_->SetData(&data, deval, eval_data_names);
|
||||
}
|
||||
inline void InitLearner(void){
|
||||
cfg.BeforeFirst();
|
||||
while (cfg.Next()){
|
||||
learner_->SetParam(cfg.name(), cfg.val());
|
||||
}
|
||||
if (model_in != "NULL"){
|
||||
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
|
||||
learner_->LoadModel(fi);
|
||||
fi.Close();
|
||||
}
|
||||
else{
|
||||
utils::Assert(task == "train", "model_in not specified");
|
||||
learner_->InitModel();
|
||||
}
|
||||
learner_->InitTrainer();
|
||||
}
|
||||
cfg_batch.BeforeFirst();
|
||||
while (cfg_batch.Next()){
|
||||
if (!strcmp(cfg_batch.name(), "run")){
|
||||
learner_->UpdateInteract(interact_action);
|
||||
batch_action += 1;
|
||||
}
|
||||
else{
|
||||
learner_->SetParam(cfg_batch.name(), cfg_batch.val());
|
||||
}
|
||||
}
|
||||
|
||||
inline void TaskTrain(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
for (int i = 0; i < num_round; ++i){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||
learner_->UpdateOneIter(i);
|
||||
learner_->EvalOneIter(i);
|
||||
if (save_period != 0 && (i + 1) % save_period == 0){
|
||||
this->SaveModel(i);
|
||||
}
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
}
|
||||
// always save final round
|
||||
if (save_period == 0 || num_round % save_period != 0){
|
||||
if (model_out == "NULL"){
|
||||
this->SaveModel(num_round - 1);
|
||||
}
|
||||
else{
|
||||
this->SaveModel(model_out.c_str());
|
||||
}
|
||||
}
|
||||
if (!silent){
|
||||
printf("\nupdating end, %lu sec in all\n", elapsed);
|
||||
}
|
||||
}
|
||||
/*! \brief evaluate the current model once, passing 0 as the iteration number */
inline void TaskEval(void){
    const int eval_round = 0;
    learner_->EvalOneIter(eval_round);
}
|
||||
inline void TaskInteractive(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
int batch_action = 0;
|
||||
if (batch_action == 0){
|
||||
learner_->UpdateInteract(interact_action);
|
||||
}
|
||||
utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
|
||||
this->SaveModel(model_out.c_str());
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
|
||||
cfg_batch.BeforeFirst();
|
||||
while (cfg_batch.Next()){
|
||||
if (!strcmp(cfg_batch.name(), "run")){
|
||||
learner_->UpdateInteract(interact_action);
|
||||
batch_action += 1;
|
||||
}
|
||||
else{
|
||||
learner_->SetParam(cfg_batch.name(), cfg_batch.val());
|
||||
}
|
||||
}
|
||||
if (!silent){
|
||||
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
if (batch_action == 0){
|
||||
learner_->UpdateInteract(interact_action);
|
||||
}
|
||||
utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
|
||||
this->SaveModel(model_out.c_str());
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
/*! \brief write a text dump of the model to the file named by name_dump */
inline void TaskDump(void){
    const bool with_stats = (dump_model_stats != 0);
    FILE *dump_file = utils::FopenCheck(name_dump.c_str(), "w");
    learner_->DumpModel(dump_file, fmap, with_stats);
    fclose(dump_file);
}
|
||||
/*! \brief write the learner's path dump for the loaded data to the file named by name_dumppath */
inline void TaskDumpPath(void){
    FILE *path_file = utils::FopenCheck(name_dumppath.c_str(), "w");
    learner_->DumpPath(path_file, data);
    fclose(path_file);
}
|
||||
inline void SaveModel(const char *fname) const{
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
learner_->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
/*!
 * \brief save the model of one boosting round into model_dir_path
 * \param i zero-based round index; the file is named with i + 1, e.g. "0001.model"
 */
inline void SaveModel(int i) const{
    // Only the fixed-size numeric part goes through sprintf; the directory is
    // joined as std::string so a long model_dir_path cannot overflow a buffer.
    char round_file[32];
    sprintf(round_file, "%04d.model", i + 1);
    const std::string fname = model_dir_path + "/" + round_file;
    this->SaveModel(fname.c_str());
}
|
||||
inline void TaskPred(void){
|
||||
std::vector<float> preds;
|
||||
if (!silent) printf("start prediction...\n");
|
||||
learner_->Predict(preds, data);
|
||||
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
|
||||
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
|
||||
for (size_t i = 0; i < preds.size(); i++){
|
||||
fprintf(fo, "%f\n", preds[i]);
|
||||
}
|
||||
fclose(fo);
|
||||
}
|
||||
private:
|
||||
/* \brief specify the learning task*/
|
||||
int learning_task;
|
||||
/* \brief whether silent */
|
||||
int silent;
|
||||
/* \brief whether use auto binary buffer */
|
||||
int use_buffer;
|
||||
/* \brief number of boosting iterations */
|
||||
int num_round;
|
||||
/* \brief the period to save the model, 0 means only save the final round model */
|
||||
int save_period;
|
||||
/*! \brief action to perform in interactive mode */
|
||||
std::string interact_action;
|
||||
/* \brief the path of training/test data set */
|
||||
std::string train_path, test_path;
|
||||
/* \brief the path of test model file, or file to restart training */
|
||||
std::string model_in;
|
||||
/* \brief the path of final model file, to be saved */
|
||||
std::string model_out;
|
||||
/* \brief the path of directory containing the saved models */
|
||||
std::string model_dir_path;
|
||||
/* \brief task to perform, choosing training or testing */
|
||||
std::string task;
|
||||
/* \brief name of predict file */
|
||||
std::string name_pred;
|
||||
/* \brief whether dump statistics along with model */
|
||||
int dump_model_stats;
|
||||
/* \brief name of feature map */
|
||||
std::string name_fmap;
|
||||
/* \brief name of dump file */
|
||||
std::string name_dump;
|
||||
/* \brief name of dump path file */
|
||||
std::string name_dumppath;
|
||||
/* \brief the paths of validation data sets */
|
||||
std::vector<std::string> eval_data_paths;
|
||||
/* \brief the names of the evaluation data used in output log */
|
||||
std::vector<std::string> eval_data_names;
|
||||
/*! \brief saves configurations */
|
||||
utils::ConfigSaver cfg;
|
||||
/*! \brief batch configurations */
|
||||
utils::ConfigSaver cfg_batch;
|
||||
private:
|
||||
DMatrix data;
|
||||
std::vector<DMatrix*> deval;
|
||||
utils::FeatMap fmap;
|
||||
BoostLearner* learner_;
|
||||
|
||||
if (!silent){
|
||||
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief write a text dump of the model to the file named by name_dump */
inline void TaskDump(void){
    const bool with_stats = (dump_model_stats != 0);
    FILE *dump_file = utils::FopenCheck(name_dump.c_str(), "w");
    learner_->DumpModel(dump_file, fmap, with_stats);
    fclose(dump_file);
}
|
||||
/*! \brief write the learner's path dump for the loaded data to the file named by name_dumppath */
inline void TaskDumpPath(void){
    FILE *path_file = utils::FopenCheck(name_dumppath.c_str(), "w");
    learner_->DumpPath(path_file, data);
    fclose(path_file);
}
|
||||
inline void SaveModel(const char *fname) const{
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
learner_->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
/*!
 * \brief save the model of one boosting round into model_dir_path
 * \param i zero-based round index; the file is named with i + 1, e.g. "0001.model"
 */
inline void SaveModel(int i) const{
    // Only the fixed-size numeric part goes through sprintf; the directory is
    // joined as std::string so a long model_dir_path cannot overflow a buffer.
    char round_file[32];
    sprintf(round_file, "%04d.model", i + 1);
    const std::string fname = model_dir_path + "/" + round_file;
    this->SaveModel(fname.c_str());
}
|
||||
inline void TaskPred(void){
|
||||
std::vector<float> preds;
|
||||
if (!silent) printf("start prediction...\n");
|
||||
learner_->Predict(preds, data);
|
||||
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
|
||||
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
|
||||
for (size_t i = 0; i < preds.size(); i++){
|
||||
fprintf(fo, "%f\n", preds[i]);
|
||||
}
|
||||
fclose(fo);
|
||||
}
|
||||
private:
|
||||
/* \brief specify the learning task*/
|
||||
int learning_task;
|
||||
/* \brief whether silent */
|
||||
int silent;
|
||||
/* \brief whether use auto binary buffer */
|
||||
int use_buffer;
|
||||
/* \brief number of boosting iterations */
|
||||
int num_round;
|
||||
/* \brief the period to save the model, 0 means only save the final round model */
|
||||
int save_period;
|
||||
/*! \brief action to perform in interactive mode */
|
||||
std::string interact_action;
|
||||
/* \brief the path of training/test data set */
|
||||
std::string train_path, test_path;
|
||||
/* \brief the path of test model file, or file to restart training */
|
||||
std::string model_in;
|
||||
/* \brief the path of final model file, to be saved */
|
||||
std::string model_out;
|
||||
/* \brief the path of directory containing the saved models */
|
||||
std::string model_dir_path;
|
||||
/* \brief task to perform, choosing training or testing */
|
||||
std::string task;
|
||||
/* \brief name of predict file */
|
||||
std::string name_pred;
|
||||
/* \brief whether dump statistics along with model */
|
||||
int dump_model_stats;
|
||||
/* \brief name of feature map */
|
||||
std::string name_fmap;
|
||||
/* \brief name of dump file */
|
||||
std::string name_dump;
|
||||
/* \brief name of dump path file */
|
||||
std::string name_dumppath;
|
||||
/* \brief the paths of validation data sets */
|
||||
std::vector<std::string> eval_data_paths;
|
||||
/* \brief the names of the evaluation data used in output log */
|
||||
std::vector<std::string> eval_data_names;
|
||||
/*! \brief saves configurations */
|
||||
utils::ConfigSaver cfg;
|
||||
/*! \brief batch configurations */
|
||||
utils::ConfigSaver cfg_batch;
|
||||
private:
|
||||
DMatrix data;
|
||||
std::vector<DMatrix*> deval;
|
||||
utils::FeatMap fmap;
|
||||
BoostLearner* learner_;
|
||||
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
@ -9,183 +9,206 @@
|
||||
|
||||
|
||||
namespace xgboost{
|
||||
namespace base{
|
||||
/*! \brief data matrix for regression,classification,rank content */
|
||||
struct DMatrix{
|
||||
public:
|
||||
/*! \brief maximum feature dimension */
|
||||
unsigned num_feature;
|
||||
/*! \brief feature data content */
|
||||
booster::FMatrixS data;
|
||||
/*! \brief label of each instance */
|
||||
std::vector<float> labels;
|
||||
/*! \brief the index of begin and end of a group,
|
||||
* needed when the learning task is ranking*/
|
||||
std::vector<int> group_index;
|
||||
public:
|
||||
/*! \brief default constructor */
|
||||
DMatrix(void){}
|
||||
namespace base{
|
||||
/*! \brief data matrix for regression, classification, rank content */
|
||||
struct DMatrix{
|
||||
public:
|
||||
/*! \brief maximum feature dimension */
|
||||
unsigned num_feature;
|
||||
/*! \brief feature data content */
|
||||
booster::FMatrixS data;
|
||||
/*! \brief label of each instance */
|
||||
std::vector<float> labels;
|
||||
/*! \brief the index of begin and end of a group,
|
||||
* needed when the learning task is ranking*/
|
||||
std::vector<int> group_index;
|
||||
public:
|
||||
/*! \brief default constructor */
|
||||
DMatrix(void){}
|
||||
|
||||
/*! \brief get the number of instances */
|
||||
inline size_t Size() const{
|
||||
return labels.size();
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \param fname file of instances data
|
||||
* \param fgroup file of the group data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void LoadText(const char* fname, const char* fgroup, bool silent = false){
|
||||
data.Clear();
|
||||
FILE* file = utils::FopenCheck(fname, "r");
|
||||
float label; bool init = true;
|
||||
char tmp[1024];
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
/*! \brief get the number of instances */
|
||||
inline size_t Size() const{
|
||||
return labels.size();
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \param fname file of instances data
|
||||
* \param fgroup file of the group data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void LoadText(const char* fname, const char* fgroup, bool silent = false){
|
||||
data.Clear();
|
||||
FILE* file = utils::FopenCheck(fname, "r");
|
||||
float label; bool init = true;
|
||||
char tmp[1024];
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
|
||||
while (fscanf(file, "%s", tmp) == 1){
|
||||
unsigned index; float value;
|
||||
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||
findex.push_back(index); fvalue.push_back(value);
|
||||
}
|
||||
else{
|
||||
if (!init){
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
}
|
||||
findex.clear(); fvalue.clear();
|
||||
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
||||
init = false;
|
||||
}
|
||||
}
|
||||
while (fscanf(file, "%s", tmp) == 1){
|
||||
unsigned index; float value;
|
||||
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||
findex.push_back(index); fvalue.push_back(value);
|
||||
}
|
||||
else{
|
||||
if (!init){
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
}
|
||||
findex.clear(); fvalue.clear();
|
||||
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
||||
init = false;
|
||||
}
|
||||
}
|
||||
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
fclose(file);
|
||||
LoadGroup(fgroup,silent);
|
||||
}
|
||||
|
||||
inline void LoadGroup(const char* fgroup, bool silent = false){
|
||||
//if exists group data load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
|
||||
if (file_group != NULL){
|
||||
group_index.push_back(0);
|
||||
int tmp = 0, acc = 0,cnt = 0;
|
||||
while (fscanf(file_group, "%d", &tmp) == 1){
|
||||
acc += tmp;
|
||||
group_index.push_back(acc);
|
||||
cnt++;
|
||||
}
|
||||
if(!silent) printf("%d groups are loaded from %s\n",cnt,fgroup);
|
||||
fclose(file_group);
|
||||
}else{
|
||||
if(!silent) printf("There is no group file\n");
|
||||
}
|
||||
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \return whether loading is success
|
||||
*/
|
||||
inline bool LoadBinary(const char* fname, const char* fgroup, bool silent = false){
|
||||
FILE *fp = fopen64(fname, "rb");
|
||||
if (fp == NULL) return false;
|
||||
utils::FileStream fs(fp);
|
||||
data.LoadBinary(fs);
|
||||
labels.resize(data.NumRow());
|
||||
utils::Assert(fs.Read(&labels[0], sizeof(float) * data.NumRow()) != 0, "DMatrix LoadBinary");
|
||||
fs.Close();
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
fclose(file);
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s as binary\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
|
||||
//if exists group data load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
if (file_group != NULL){
|
||||
group_index.push_back(0);
|
||||
int tmp = 0, acc = 0;
|
||||
while (fscanf(file_group, "%d", tmp) == 1){
|
||||
acc += tmp;
|
||||
group_index.push_back(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \return whether loading is success
|
||||
*/
|
||||
inline bool LoadBinary(const char* fname, const char* fgroup, bool silent = false){
|
||||
FILE *fp = fopen64(fname, "rb");
|
||||
if (fp == NULL) return false;
|
||||
utils::FileStream fs(fp);
|
||||
data.LoadBinary(fs);
|
||||
labels.resize(data.NumRow());
|
||||
utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
|
||||
fs.Close();
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
LoadGroupBinary(fgroup,silent);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief save to binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void SaveBinary(const char* fname, const char* fgroup, bool silent = false){
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||
data.SaveBinary(fs);
|
||||
fs.Write(&labels[0], sizeof(float)* data.NumRow());
|
||||
fs.Close();
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s as binary\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
|
||||
//if group data exists load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
if (file_group != NULL){
|
||||
int group_index_size = 0;
|
||||
utils::FileStream group_stream(file_group);
|
||||
utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size");
|
||||
group_index.resize(group_index_size);
|
||||
utils::Assert(group_stream.Read(&group_index, sizeof(int)* group_index_size) != 0, "Load group indice");
|
||||
SaveGroupBinary(fgroup,silent);
|
||||
}
|
||||
|
||||
inline void SaveGroupBinary(const char* fgroup, bool silent = false){
|
||||
//save group data
|
||||
if (group_index.size() > 0){
|
||||
utils::FileStream file_group(utils::FopenCheck(fgroup, "wb"));
|
||||
int group_index_size = group_index.size();
|
||||
file_group.Write(&(group_index_size), sizeof(int));
|
||||
file_group.Write(&group_index[0], sizeof(int) * group_index_size);
|
||||
file_group.Close();
|
||||
if(!silent){printf("Index info of %d groups is saved to %s as binary\n",group_index_size-1,fgroup);}
|
||||
}
|
||||
}
|
||||
|
||||
inline void LoadGroupBinary(const char* fgroup, bool silent = false){
|
||||
//if group data exists load it in
|
||||
FILE *file_group = fopen64(fgroup, "r");
|
||||
if (file_group != NULL){
|
||||
int group_index_size = 0;
|
||||
utils::FileStream group_stream(file_group);
|
||||
utils::Assert(group_stream.Read(&group_index_size, sizeof(int)) != 0, "Load group indice size");
|
||||
group_index.resize(group_index_size);
|
||||
utils::Assert(group_stream.Read(&group_index[0], sizeof(int) * group_index_size) != 0, "Load group indice");
|
||||
|
||||
if (!silent){
|
||||
printf("the group index of %d groups is loaded from %s\n",
|
||||
group_index_size - 1, fgroup);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief save to binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void SaveBinary(const char* fname, const char* fgroup, bool silent = false){
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||
data.SaveBinary(fs);
|
||||
fs.Write(&labels[0], sizeof(float)* data.NumRow());
|
||||
fs.Close();
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
|
||||
//save group data
|
||||
if (group_index.size() > 0){
|
||||
utils::FileStream file_group(utils::FopenCheck(fgroup, "wb"));
|
||||
int group_index_size = group_index.size();
|
||||
file_group.Write(&(group_index_size), sizeof(int));
|
||||
file_group.Write(&group_index[0], sizeof(int) * group_index_size);
|
||||
}
|
||||
|
||||
}
|
||||
/*!
|
||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||
* otherwise the function will first check if fname + '.buffer' exists,
|
||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||
* and try to create a buffer file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \param savebuffer whether do save binary buffer if it is text
|
||||
*/
|
||||
inline void CacheLoad(const char *fname, const char *fgroup, bool silent = false, bool savebuffer = true){
|
||||
int len = strlen(fname);
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||
this->LoadBinary(fname, fgroup, silent); return;
|
||||
}
|
||||
char bname[1024];
|
||||
sprintf(bname, "%s.buffer", fname);
|
||||
if (!this->LoadBinary(bname, fgroup, silent)){
|
||||
this->LoadText(fname, fgroup, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, fgroup, silent);
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief update num_feature info */
|
||||
inline void UpdateInfo(void){
|
||||
this->num_feature = 0;
|
||||
for (size_t i = 0; i < data.NumRow(); i++){
|
||||
booster::FMatrixS::Line sp = data[i];
|
||||
for (unsigned j = 0; j < sp.len; j++){
|
||||
if (num_feature <= sp[j].findex){
|
||||
num_feature = sp[j].findex + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
}
|
||||
if (!silent){
|
||||
printf("Index info of %d groups is loaded from %s as binary\n",
|
||||
group_index.size() - 1, fgroup);
|
||||
}
|
||||
fclose(file_group);
|
||||
}else{
|
||||
if(!silent){printf("The binary file of group info not exists");}
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||
* otherwise the function will first check if fname + '.buffer' exists,
|
||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||
* and try to create a buffer file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \param savebuffer whether do save binary buffer if it is text
|
||||
*/
|
||||
inline void CacheLoad(const char *fname, const char *fgroup, bool silent = false, bool savebuffer = true){
|
||||
int len = strlen(fname);
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||
this->LoadBinary(fname, fgroup, silent); return;
|
||||
}
|
||||
char bname[1024],bgroup[1024];
|
||||
sprintf(bname, "%s.buffer", fname);
|
||||
sprintf(bgroup, "%s.buffer", fgroup);
|
||||
if (!this->LoadBinary(bname, bgroup, silent))
|
||||
{
|
||||
this->LoadText(fname, fgroup, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, bgroup, silent);
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief update num_feature info */
|
||||
inline void UpdateInfo(void){
|
||||
this->num_feature = 0;
|
||||
for (size_t i = 0; i < data.NumRow(); i++){
|
||||
booster::FMatrixS::Line sp = data[i];
|
||||
for (unsigned j = 0; j < sp.len; j++){
|
||||
if (num_feature <= sp[j].findex){
|
||||
num_feature = sp[j].findex + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
@ -15,256 +15,264 @@
|
||||
#include "../utils/xgboost_stream.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace base {
|
||||
/*! \brief class for gradient boosting learner */
|
||||
class BoostLearner {
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
BoostLearner(void) {
|
||||
silent = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief booster associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
BoostLearner(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
silent = 0;
|
||||
this->SetData(train, evals, evname);
|
||||
}
|
||||
namespace base {
|
||||
/*! \brief class for gradient boosting learner */
|
||||
class BoostLearner {
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
BoostLearner(void) {
|
||||
silent = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief booster associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
BoostLearner(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
silent = 0;
|
||||
this->SetData(train, evals, evname);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief associate booster with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
inline void SetData(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
this->train_ = train;
|
||||
this->evals_ = evals;
|
||||
this->evname_ = evname;
|
||||
// estimate feature bound
|
||||
int num_feature = (int)(train->data.NumCol());
|
||||
// assign buffer index
|
||||
unsigned buffer_size = static_cast<unsigned>(train->Size());
|
||||
/*!
|
||||
* \brief associate booster with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
inline void SetData(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
this->train_ = train;
|
||||
this->evals_ = evals;
|
||||
this->evname_ = evname;
|
||||
// estimate feature bound
|
||||
int num_feature = (int)(train->data.NumCol());
|
||||
// assign buffer index
|
||||
unsigned buffer_size = static_cast<unsigned>(train->Size());
|
||||
|
||||
for (size_t i = 0; i < evals.size(); ++i) {
|
||||
buffer_size += static_cast<unsigned>(evals[i]->Size());
|
||||
num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol()));
|
||||
}
|
||||
for (size_t i = 0; i < evals.size(); ++i) {
|
||||
buffer_size += static_cast<unsigned>(evals[i]->Size());
|
||||
num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol()));
|
||||
}
|
||||
|
||||
char str_temp[25];
|
||||
if (num_feature > mparam.num_feature) {
|
||||
mparam.num_feature = num_feature;
|
||||
sprintf(str_temp, "%d", num_feature);
|
||||
base_gbm.SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
char str_temp[25];
|
||||
if (num_feature > mparam.num_feature) {
|
||||
mparam.num_feature = num_feature;
|
||||
sprintf(str_temp, "%d", num_feature);
|
||||
base_gbm.SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
|
||||
sprintf(str_temp, "%u", buffer_size);
|
||||
base_gbm.SetParam("num_pbuffer", str_temp);
|
||||
if (!silent) {
|
||||
printf("buffer_size=%u\n", buffer_size);
|
||||
}
|
||||
sprintf(str_temp, "%u", buffer_size);
|
||||
base_gbm.SetParam("num_pbuffer", str_temp);
|
||||
if (!silent) {
|
||||
printf("buffer_size=%u\n", buffer_size);
|
||||
}
|
||||
|
||||
// set eval_preds tmp sapce
|
||||
this->eval_preds_.resize(evals.size(), std::vector<float>());
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
mparam.SetParam(name, val);
|
||||
base_gbm.SetParam(name, val);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer(void) {
|
||||
base_gbm.InitTrainer();
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel(void) {
|
||||
base_gbm.InitModel();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi) {
|
||||
base_gbm.LoadModel(fi);
|
||||
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
|
||||
}
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics as well
|
||||
*/
|
||||
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats) {
|
||||
base_gbm.DumpModel(fo, fmap, with_stats);
|
||||
}
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath(FILE *fo, const DMatrix &data) {
|
||||
base_gbm.DumpPath(fo, data.data);
|
||||
}
|
||||
// set eval_preds tmp sapce
|
||||
this->eval_preds_.resize(evals.size(), std::vector<float>());
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
mparam.SetParam(name, val);
|
||||
base_gbm.SetParam(name, val);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer(void) {
|
||||
base_gbm.InitTrainer();
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel(void) {
|
||||
base_gbm.InitModel();
|
||||
if(!silent) printf("BoostLearner:InitModel Done!\n");
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi) {
|
||||
base_gbm.LoadModel(fi);
|
||||
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
|
||||
}
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics as well
|
||||
*/
|
||||
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats) {
|
||||
base_gbm.DumpModel(fo, fmap, with_stats);
|
||||
}
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath(FILE *fo, const DMatrix &data) {
|
||||
base_gbm.DumpPath(fo, data.data);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
base_gbm.SaveModel(fo);
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
}
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
base_gbm.SaveModel(fo);
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
}
|
||||
|
||||
/*!
 * \brief evaluation hook for one iteration; the base implementation does nothing
 * \param iter iteration number
 * \param fo file that evaluation output would be written to
 */
virtual void EvalOneIter(int iter, FILE *fo = stderr) {
}
|
||||
/*!
 * \brief evaluation hook for one iteration; the base implementation does nothing
 * \param iter iteration number
 * \param fo file that evaluation output would be written to
 */
virtual void EvalOneIter(int iter, FILE *fo = stderr) {
}
|
||||
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateOneIter(int iter) {
|
||||
this->PredictBuffer(preds_, *train_, 0);
|
||||
this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateOneIter(int iter) {
|
||||
this->PredictBuffer(preds_, *train_, 0);
|
||||
this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
|
||||
// printf("xgboost_learner.h:UpdateOneIter\n");
|
||||
// const unsigned ndata = static_cast<unsigned>(train_->Size());
|
||||
// #pragma omp parallel for schedule( static )
|
||||
// for (unsigned j = 0; j < ndata; ++j) {
|
||||
// printf("haha:%d %f\n",j,base_gbm.Predict(train_->data, j, j));
|
||||
// }
|
||||
}
|
||||
|
||||
/*! \brief get intransformed prediction, without buffering */
|
||||
inline void Predict(std::vector<float> &preds, const DMatrix &data) {
|
||||
preds.resize(data.Size());
|
||||
/*! \brief get intransformed prediction, without buffering */
|
||||
inline void Predict(std::vector<float> &preds, const DMatrix &data) {
|
||||
preds.resize(data.Size());
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.Predict(data.data, j, -1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.Predict(data.data, j, -1);
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
virtual inline void UpdateInteract(std::string action){
|
||||
this->InteractPredict(preds_, *train_, 0);
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
virtual inline void UpdateInteract(std::string action){
|
||||
this->InteractPredict(preds_, *train_, 0);
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->InteractPredict(preds, *evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->InteractPredict(preds, *evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
if (action == "remove") {
|
||||
base_gbm.DelteBooster();
|
||||
return;
|
||||
}
|
||||
|
||||
if (action == "remove") {
|
||||
base_gbm.DelteBooster();
|
||||
return;
|
||||
}
|
||||
this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
|
||||
this->GetGradient(preds_, train_->labels, train_->group_index, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
this->InteractRePredict(*train_, 0);
|
||||
buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
this->InteractRePredict(*evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
};
|
||||
|
||||
this->InteractRePredict(*train_, 0);
|
||||
buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
this->InteractRePredict(*evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
};
|
||||
protected:
|
||||
/*! \brief get the intransformed predictions, given data */
|
||||
inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
|
||||
preds.resize(data.Size());
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.InteractPredict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
/*! \brief repredict trial */
|
||||
inline void InteractRePredict(const xgboost::base::DMatrix &data, unsigned buffer_offset) {
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
/*! \brief get the intransformed predictions, given data */
|
||||
inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
|
||||
preds.resize(data.Size());
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.InteractPredict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
/*! \brief repredict trial */
|
||||
inline void InteractRePredict(const xgboost::base::DMatrix &data, unsigned buffer_offset) {
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
/*! \brief get intransformed predictions, given data */
|
||||
virtual inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
|
||||
preds.resize(data.Size());
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.Predict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief get intransformed predictions, given data */
|
||||
virtual inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset) {
|
||||
preds.resize(data.Size());
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j) {
|
||||
preds[j] = base_gbm.Predict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
||||
virtual inline void GetGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess) {};
|
||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
||||
virtual inline void GetGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess) {};
|
||||
|
||||
|
||||
protected:
|
||||
protected:
|
||||
|
||||
/*! \brief training parameter for regression */
|
||||
struct ModelParam {
|
||||
/* \brief type of loss function */
|
||||
int loss_type;
|
||||
/* \brief number of features */
|
||||
int num_feature;
|
||||
/*! \brief reserved field */
|
||||
int reserved[16];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
loss_type = 0;
|
||||
num_feature = 0;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("loss_type", name)) loss_type = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
/*! \brief training parameter for regression */
|
||||
struct ModelParam {
|
||||
/* \brief type of loss function */
|
||||
int loss_type;
|
||||
/* \brief number of features */
|
||||
int num_feature;
|
||||
/*! \brief reserved field */
|
||||
int reserved[16];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
loss_type = 0;
|
||||
num_feature = 0;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("loss_type", name)) loss_type = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
int silent;
|
||||
booster::GBMBase base_gbm;
|
||||
ModelParam mparam;
|
||||
const DMatrix *train_;
|
||||
std::vector<DMatrix *> evals_;
|
||||
std::vector<std::string> evname_;
|
||||
std::vector<unsigned> buffer_index_;
|
||||
std::vector<float> grad_, hess_, preds_;
|
||||
std::vector< std::vector<float> > eval_preds_;
|
||||
};
|
||||
}
|
||||
int silent;
|
||||
booster::GBMBase base_gbm;
|
||||
ModelParam mparam;
|
||||
const DMatrix *train_;
|
||||
std::vector<DMatrix *> evals_;
|
||||
std::vector<std::string> evname_;
|
||||
std::vector<unsigned> buffer_index_;
|
||||
std::vector<float> grad_, hess_, preds_;
|
||||
std::vector< std::vector<float> > eval_preds_;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
*/
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include "xgboost_sample.h"
|
||||
#include "xgboost_rank_eval.h"
|
||||
#include "../base/xgboost_data_instance.h"
|
||||
@ -18,133 +18,273 @@
|
||||
#include "../base/xgboost_learner.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace rank {
|
||||
/*! \brief class for gradient boosted ranking */
|
||||
class RankBoostLearner :public base::BoostLearner{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
RankBoostLearner(void) {
|
||||
BoostLearner();
|
||||
namespace rank {
|
||||
/*! \brief class for gradient boosted ranking */
|
||||
class RankBoostLearner :public base::BoostLearner{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
RankBoostLearner(void) {
|
||||
BoostLearner();
|
||||
}
|
||||
/*!
|
||||
* \brief a rank booster associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
RankBoostLearner(const base::DMatrix *train,
|
||||
const std::vector<base::DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
|
||||
BoostLearner(train, evals, evname);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space
|
||||
* and do other preparation
|
||||
*/
|
||||
inline void InitTrainer(void) {
|
||||
BoostLearner::InitTrainer();
|
||||
if (mparam.loss_type == PAIRWISE) {
|
||||
evaluator_.AddEval("PAIR");
|
||||
}
|
||||
else if (mparam.loss_type == MAP) {
|
||||
evaluator_.AddEval("MAP");
|
||||
}
|
||||
else {
|
||||
evaluator_.AddEval("NDCG");
|
||||
}
|
||||
evaluator_.Init();
|
||||
}
|
||||
|
||||
void EvalOneIter(int iter, FILE *fo = stderr) {
|
||||
fprintf(fo, "[%d]", iter);
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->PredictBuffer(preds, *evals_[i], buffer_offset);
|
||||
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels, (*evals_[i]).group_index);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
fprintf(fo, "\n");
|
||||
}
|
||||
|
||||
virtual inline void SetParam(const char *name, const char *val){
|
||||
BoostLearner::SetParam(name,val);
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (!strcmp(name, "rank:sampler")) sampler.AssignSampler(atoi(val));
|
||||
}
|
||||
|
||||
private:
|
||||
/*!
 * \brief collect (prediction, label, row index) triples for one group and sort them
 *  with Triplef1Comparer
 * \param preds predictions of all instances
 * \param labels labels of all instances
 * \param group_index group boundaries; group g covers rows [group_index[g], group_index[g+1])
 * \param group index of the group to extract
 * \return the sorted triples of that group
 */
inline std::vector< Triple<float,float,int> > GetSortedTuple(const std::vector<float> &preds,
                                                             const std::vector<float> &labels,
                                                             const std::vector<int> &group_index,
                                                             int group){
    typedef Triple<float,float,int> Entry;
    const int row_begin = group_index[group];
    const int row_end = group_index[group+1];
    std::vector<Entry> entries;
    for (int row = row_begin; row < row_end; ++row) {
        entries.push_back(Entry(preds[row], labels[row], row));
    }
    std::sort(entries.begin(), entries.end(), Triplef1Comparer);
    return entries;
}
|
||||
|
||||
inline std::vector<int> GetIndexMap(std::vector< Triple<float,float,int> > sorted_triple,int start){
|
||||
std::vector<int> index_remap;
|
||||
index_remap.resize(sorted_triple.size());
|
||||
for(int i = 0; i < sorted_triple.size(); i++){
|
||||
index_remap[sorted_triple[i].f3_-start] = i;
|
||||
}
|
||||
return index_remap;
|
||||
}
|
||||
|
||||
inline float GetLambdaMAP(const std::vector< Triple<float,float,int> > sorted_triple,
|
||||
int index1,int index2,
|
||||
std::vector< Quadruple<float,float,float,float> > map_acc){
|
||||
if(index1 > index2) std::swap(index1,index2);
|
||||
float original = map_acc[index2].f1_;
|
||||
if(index1 != 0) original -= map_acc[index1 - 1].f1_;
|
||||
float changed = 0;
|
||||
if(sorted_triple[index1].f2_ < sorted_triple[index2].f2_){
|
||||
changed += map_acc[index2 - 1].f3_ - map_acc[index1].f3_;
|
||||
changed += (map_acc[index1].f4_ + 1.0f)/(index1 + 1);
|
||||
}else{
|
||||
changed += map_acc[index2 - 1].f2_ - map_acc[index1].f2_;
|
||||
changed += map_acc[index2].f4_/(index2 + 1);
|
||||
}
|
||||
float ans = (changed - original)/(map_acc[map_acc.size() - 1].f4_);
|
||||
if(ans < 0) ans = -ans;
|
||||
return ans;
|
||||
}
|
||||
|
||||
/*!
 * \brief magnitude of the NDCG-based weight for a pair of positions in the sorted list:
 *  |gain/discount sum before the swap - gain/discount sum after the swap| / IDCG
 * \param sorted_triple sorted triples of one group; f2_ holds the label
 * \param index1 position of the first instance in sorted_triple
 * \param index2 position of the second instance in sorted_triple
 * \param IDCG normalizer, as computed by the caller
 * \return absolute normalized difference
 *
 * The list is taken by const reference: this function is called once per sampled pair
 * and previously copied the whole vector on every call.
 */
inline float GetLambdaNDCG(const std::vector< Triple<float,float,int> > &sorted_triple,
                           int index1,
                           int index2,float IDCG){
    float original = pow(2,sorted_triple[index1].f2_)/log(index1+2)
                   + pow(2,sorted_triple[index2].f2_)/log(index2+2);
    float changed = pow(2,sorted_triple[index2].f2_)/log(index1+2)
                  + pow(2,sorted_triple[index1].f2_)/log(index2+2);
    float ans = (original - changed)/IDCG;
    if(ans < 0) ans = -ans;
    return ans;
}
|
||||
|
||||
|
||||
/*!
 * \brief DCG of the group's labels when they are sorted in descending order
 * \param sorted_triple triples of one group; only the label f2_ is used
 * \return EvalNDCG::DCG of the descending label list
 */
inline float GetIDCG(const std::vector< Triple<float,float,int> > &sorted_triple){
    // const reference avoids copying the list; reserve avoids repeated reallocation
    std::vector<float> labels;
    labels.reserve(sorted_triple.size());
    for (size_t i = 0; i < sorted_triple.size(); ++i) {
        labels.push_back(sorted_triple[i].f2_);
    }
    std::sort(labels.begin(), labels.end(), std::greater<float>());
    return EvalNDCG::DCG(labels);
}
|
||||
|
||||
/*!
 * \brief build the running accumulators used by the MAP weight, one entry per position.
 *  Walking the list in order, an instance whose label equals 1 counts as a hit; at every
 *  position the entry stores
 *    f1_: sum over hits so far of hit / rank
 *    f2_: sum over hits so far of (hit - 1) / rank
 *    f3_: sum over hits so far of (hit + 1) / rank
 *    f4_: number of hits so far
 *  where rank is the 1-based position and hit is the hit count including the current one.
 * \param sorted_triple sorted triples of one group; f2_ holds the label
 * \return the accumulator list, same length as sorted_triple
 */
inline std::vector< Quadruple<float,float,float,float> > GetMAPAcc(const std::vector< Triple<float,float,int> > &sorted_triple){
    // const reference avoids copying the list; reserve avoids repeated reallocation
    std::vector< Quadruple<float,float,float,float> > map_acc;
    map_acc.reserve(sorted_triple.size());
    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    for (size_t i = 0; i < sorted_triple.size(); ++i) {
        if (sorted_triple[i].f2_ == 1) {
            const float rank = static_cast<float>(i + 1);
            hit++;
            acc1 += hit / rank;
            acc2 += (hit - 1) / rank;
            acc3 += (hit + 1) / rank;
        }
        map_acc.push_back(Quadruple<float,float,float,float>(acc1, acc2, acc3, hit));
    }
    return map_acc;
}
|
||||
|
||||
inline void GetGroupGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const std::vector< Triple<float,float,int> > sorted_triple,
|
||||
const std::vector<int> index_remap,
|
||||
const sample::Pairs& pairs,
|
||||
int group){
|
||||
bool j_better;
|
||||
float IDCG, pred_diff, pred_diff_exp, delta;
|
||||
float first_order_gradient, second_order_gradient;
|
||||
std::vector< Quadruple<float,float,float,float> > map_acc;
|
||||
|
||||
if(mparam.loss_type == NDCG){
|
||||
IDCG = GetIDCG(sorted_triple);
|
||||
}else if(mparam.loss_type == MAP){
|
||||
map_acc = GetMAPAcc(sorted_triple);
|
||||
}
|
||||
|
||||
for (int j = group_index[group]; j < group_index[group + 1]; j++){
|
||||
std::vector<int> pair_instance = pairs.GetPairs(j);
|
||||
for (int k = 0; k < pair_instance.size(); k++){
|
||||
j_better = labels[j] > labels[pair_instance[k]];
|
||||
if (j_better){
|
||||
switch(mparam.loss_type){
|
||||
case PAIRWISE: delta = 1.0;break;
|
||||
case MAP: delta = GetLambdaMAP(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],map_acc);break;
|
||||
case NDCG: delta = GetLambdaNDCG(sorted_triple,index_remap[j - group_index[group]],index_remap[pair_instance[k]-group_index[group]],IDCG);break;
|
||||
default: utils::Error("Cannot find the specified loss type");
|
||||
}
|
||||
|
||||
pred_diff = preds[preds[j] - pair_instance[k]];
|
||||
pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff);
|
||||
first_order_gradient = delta * FirstOrderGradient(pred_diff_exp);
|
||||
second_order_gradient = 2 * delta * SecondOrderGradient(pred_diff_exp);
|
||||
hess[j] += second_order_gradient;
|
||||
grad[j] += first_order_gradient;
|
||||
hess[pair_instance[k]] += second_order_gradient;
|
||||
grad[pair_instance[k]] += -first_order_gradient;
|
||||
}
|
||||
/*!
|
||||
* \brief a rank booster associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
RankBoostLearner(const base::DMatrix *train,
|
||||
const std::vector<base::DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
}
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*! \brief get the first order and second order gradient, given the
|
||||
* intransformed predictions and labels */
|
||||
inline void GetGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess) {
|
||||
grad.resize(preds.size());
|
||||
hess.resize(preds.size());
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
sample::Pairs pairs = sampler.GenPairs(preds, labels, group_index[i], group_index[i + 1]);
|
||||
//pairs.GetPairs()
|
||||
std::vector< Triple<float,float,int> > sorted_triple = GetSortedTuple(preds,labels,group_index,i);
|
||||
std::vector<int> index_remap = GetIndexMap(sorted_triple,group_index[i]);
|
||||
GetGroupGradient(preds,labels,group_index,
|
||||
grad,hess,sorted_triple,index_remap,pairs,i);
|
||||
}
|
||||
}
|
||||
|
||||
BoostLearner(train, evals, evname);
|
||||
}
|
||||
inline void UpdateInteract(std::string action) {
|
||||
this->InteractPredict(preds_, *train_, 0);
|
||||
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space
|
||||
* and do other preparation
|
||||
*/
|
||||
inline void InitTrainer(void) {
|
||||
BoostLearner::InitTrainer();
|
||||
if (mparam.loss_type == PAIRWISE) {
|
||||
evaluator_.AddEval("PAIR");
|
||||
}
|
||||
else if (mparam.loss_type == MAP) {
|
||||
evaluator_.AddEval("MAP");
|
||||
}
|
||||
else {
|
||||
evaluator_.AddEval("NDCG");
|
||||
}
|
||||
evaluator_.Init();
|
||||
}
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->InteractPredict(preds, *evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
|
||||
void EvalOneIter(int iter, FILE *fo = stderr) {
|
||||
fprintf(fo, "[%d]", iter);
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
if (action == "remove"){
|
||||
base_gbm.DelteBooster(); return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->PredictBuffer(preds, *evals_[i], buffer_offset);
|
||||
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels, (*evals_[i]).group_index);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
fprintf(fo, "\n");
|
||||
}
|
||||
this->GetGradient(preds_, train_->labels,train_->group_index, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (!strcmp(name, "rank:sampler")) sampler.AssignSampler(atoi(val));
|
||||
}
|
||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
||||
inline void GetGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess) {
|
||||
grad.resize(preds.size());
|
||||
hess.resize(preds.size());
|
||||
bool j_better;
|
||||
float pred_diff, pred_diff_exp, first_order_gradient, second_order_gradient;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
sample::Pairs pairs = sampler.GenPairs(preds, labels, group_index[i], group_index[i + 1]);
|
||||
for (int j = group_index[i]; j < group_index[i + 1]; j++){
|
||||
std::vector<int> pair_instance = pairs.GetPairs(j);
|
||||
for (int k = 0; k < pair_instance.size(); k++){
|
||||
j_better = labels[j] > labels[pair_instance[k]];
|
||||
if (j_better){
|
||||
pred_diff = preds[preds[j] - pair_instance[k]];
|
||||
pred_diff_exp = j_better ? expf(-pred_diff) : expf(pred_diff);
|
||||
first_order_gradient = FirstOrderGradient(pred_diff_exp);
|
||||
second_order_gradient = 2 * SecondOrderGradient(pred_diff_exp);
|
||||
hess[j] += second_order_gradient;
|
||||
grad[j] += first_order_gradient;
|
||||
hess[pair_instance[k]] += second_order_gradient;
|
||||
grad[pair_instance[k]] += -first_order_gradient;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*!
 * \brief interactive update; this implementation intentionally does nothing
 * \param action requested action (ignored)
 */
inline void UpdateInteract(std::string action) {
}
|
||||
private:
|
||||
/*! \brief ranking loss selected through the loss_type model parameter */
enum LossType {
    PAIRWISE = 0,  // evaluated with the "PAIR" metric
    MAP = 1,       // evaluated with the "MAP" metric
    NDCG = 2       // evaluated with the "NDCG" metric
};
|
||||
this->InteractRePredict(*train_, 0);
|
||||
buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
this->InteractRePredict(*evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private:
|
||||
/*! \brief ranking loss selected through the loss_type model parameter */
enum LossType {
    PAIRWISE = 0,  // evaluated with the "PAIR" metric
    MAP = 1,       // evaluated with the "MAP" metric
    NDCG = 2       // evaluated with the "NDCG" metric
};
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
* \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)),
|
||||
* given the exponential of the difference of intransformed pair predictions
|
||||
* \param the intransformed prediction of positive instance
|
||||
* \param the intransformed prediction of negative instance
|
||||
* \return first order gradient
|
||||
*/
|
||||
inline float FirstOrderGradient(float pred_diff_exp) const {
|
||||
return -pred_diff_exp / (1 + pred_diff_exp);
|
||||
}
|
||||
/*!
|
||||
* \brief calculate first order gradient of pairwise loss function(f(x) = ln(1+exp(-x)),
|
||||
* given the exponential of the difference of intransformed pair predictions
|
||||
* \param the intransformed prediction of positive instance
|
||||
* \param the intransformed prediction of negative instance
|
||||
* \return first order gradient
|
||||
*/
|
||||
inline float FirstOrderGradient(float pred_diff_exp) const {
|
||||
return -pred_diff_exp / (1 + pred_diff_exp);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief calculate second order gradient of pairwise loss function(f(x) = ln(1+exp(-x)),
|
||||
* given the exponential of the difference of intransformed pair predictions
|
||||
* \param the intransformed prediction of positive instance
|
||||
* \param the intransformed prediction of negative instance
|
||||
* \return second order gradient
|
||||
*/
|
||||
inline float SecondOrderGradient(float pred_diff_exp) const {
|
||||
return pred_diff_exp / pow(1 + pred_diff_exp, 2);
|
||||
}
|
||||
/*!
|
||||
* \brief calculate second order gradient of pairwise loss function(f(x) = ln(1+exp(-x)),
|
||||
* given the exponential of the difference of intransformed pair predictions
|
||||
* \param the intransformed prediction of positive instance
|
||||
* \param the intransformed prediction of negative instance
|
||||
* \return second order gradient
|
||||
*/
|
||||
inline float SecondOrderGradient(float pred_diff_exp) const {
|
||||
return pred_diff_exp / pow(1 + pred_diff_exp, 2);
|
||||
}
|
||||
|
||||
private:
|
||||
RankEvalSet evaluator_;
|
||||
sample::PairSamplerWrapper sampler;
|
||||
};
|
||||
};
|
||||
private:
|
||||
RankEvalSet evaluator_;
|
||||
sample::PairSamplerWrapper sampler;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@ -13,170 +13,225 @@
|
||||
#include "../utils/xgboost_omp.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace rank {
|
||||
/*! \brief interface of an evaluator that computes one ranking metric */
class IRankEvaluator {
public:
    /*! \brief virtual destructor, so a concrete evaluator can be destroyed through this interface */
    virtual ~IRankEvaluator(void) {}
    /*!
     * \brief evaluate a specific metric
     * \param preds prediction
     * \param labels label
     * \param group_index group boundaries of the instances
     * \return value of the metric
     */
    virtual float Eval(const std::vector<float> &preds,
                       const std::vector<float> &labels,
                       const std::vector<int> &group_index) const = 0;
    /*! \return name of metric */
    virtual const char *Name(void) const = 0;
};
|
||||
namespace rank {
|
||||
/*! \brief interface of an evaluator that computes one ranking metric */
class IRankEvaluator {
public:
    /*! \brief virtual destructor, so a concrete evaluator can be destroyed through this interface */
    virtual ~IRankEvaluator(void) {}
    /*!
     * \brief evaluate a specific metric
     * \param preds prediction
     * \param labels label
     * \param group_index group boundaries of the instances
     * \return value of the metric
     */
    virtual float Eval(const std::vector<float> &preds,
                       const std::vector<float> &labels,
                       const std::vector<int> &group_index) const = 0;
    /*! \return name of metric */
    virtual const char *Name(void) const = 0;
};
|
||||
|
||||
class Pair{
|
||||
public:
|
||||
float key_;
|
||||
float value_;
|
||||
class Pair{
|
||||
public:
|
||||
float key_;
|
||||
float value_;
|
||||
|
||||
Pair(float key, float value){
|
||||
key_ = key;
|
||||
value_ = value_;
|
||||
}
|
||||
};
|
||||
Pair(float key, float value):key_(key),value_(value){
|
||||
}
|
||||
};
|
||||
|
||||
bool PairKeyComparer(const Pair &a, const Pair &b){
|
||||
return a.key_ < b.key_;
|
||||
}
|
||||
bool PairKeyComparer(const Pair &a, const Pair &b){
|
||||
return a.key_ < b.key_;
|
||||
}
|
||||
|
||||
bool PairValueComparer(const Pair &a, const Pair &b){
|
||||
return a.value_ < b.value_;
|
||||
}
|
||||
|
||||
|
||||
/*! \brief Mean Average Precision */
|
||||
class EvalMAP : public IRankEvaluator {
|
||||
public:
|
||||
float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
float acc = 0;
|
||||
std::vector<Pair> pairs_sort;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
for (int j = group_index[i]; j < group_index[i + 1]; j++){
|
||||
Pair pair(preds[j], labels[j]);
|
||||
pairs_sort.push_back(pair);
|
||||
}
|
||||
acc += average_precision(pairs_sort);
|
||||
}
|
||||
return acc / (group_index.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
virtual const char *Name(void) const {
|
||||
return "MAP";
|
||||
}
|
||||
|
||||
float average_precision(std::vector<Pair> pairs_sort) const{
|
||||
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
|
||||
float hits = 0;
|
||||
float average_precision = 0;
|
||||
for (int j = 0; j < pairs_sort.size(); j++){
|
||||
if (pairs_sort[j].value_ == 1){
|
||||
hits++;
|
||||
average_precision += hits / (j + 1);
|
||||
}
|
||||
}
|
||||
if (hits != 0) average_precision /= hits;
|
||||
return average_precision;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*! \brief pairwise metric placeholder: Eval always reports 0 */
class EvalPair : public IRankEvaluator {
public:
    /*!
     * \brief evaluate the metric; none of the arguments is read
     * \return always 0
     */
    float Eval(const std::vector<float> &preds,
               const std::vector<float> &labels,
               const std::vector<int> &group_index) const {
        const float result = 0;
        return result;
    }

    /*! \return name of metric */
    const char *Name(void) const {
        const char *metric_name = "PAIR";
        return metric_name;
    }
};
|
||||
|
||||
/*! \brief Normalized DCG */
|
||||
class EvalNDCG : public IRankEvaluator {
|
||||
public:
|
||||
float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
if (group_index.size() <= 1) return 0;
|
||||
float acc = 0;
|
||||
std::vector<Pair> pairs_sort;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
for (int j = group_index[i]; j < group_index[i + 1]; j++){
|
||||
Pair pair(preds[j], labels[j]);
|
||||
pairs_sort.push_back(pair);
|
||||
}
|
||||
acc += NDCG(pairs_sort);
|
||||
}
|
||||
return acc / (group_index.size() - 1);
|
||||
}
|
||||
|
||||
float NDCG(std::vector<Pair> pairs_sort) const{
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
|
||||
float dcg = DCG(pairs_sort);
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairValueComparer);
|
||||
float IDCG = DCG(pairs_sort);
|
||||
if (IDCG == 0) return 0;
|
||||
return dcg / IDCG;
|
||||
}
|
||||
|
||||
float DCG(std::vector<Pair> pairs_sort) const{
|
||||
float ans = 0.0;
|
||||
ans += pairs_sort[0].value_;
|
||||
for (int i = 1; i < pairs_sort.size(); i++){
|
||||
ans += pairs_sort[i].value_ / log(i + 1);
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
virtual const char *Name(void) const {
|
||||
return "NDCG";
|
||||
}
|
||||
};
|
||||
bool PairValueComparer(const Pair &a, const Pair &b){
|
||||
return a.value_ < b.value_;
|
||||
}
|
||||
|
||||
/*!
 * \brief plain record of three values of independent types
 * \tparam First type of f1_
 * \tparam Second type of f2_
 * \tparam Third type of f3_
 */
template<typename First, typename Second, typename Third>
class Triple {
public:
    /*! \brief first field */
    First f1_;
    /*! \brief second field */
    Second f2_;
    /*! \brief third field */
    Third f3_;
    /*! \brief store a copy of each argument in the matching field */
    Triple(First f1, Second f2, Third f3)
        : f1_(f1),
          f2_(f2),
          f3_(f3) {}
};
|
||||
|
||||
namespace rank {
|
||||
/*! \brief a set of evaluators */
|
||||
class RankEvalSet {
|
||||
public:
|
||||
inline void AddEval(const char *name) {
|
||||
if (!strcmp(name, "PAIR")) evals_.push_back(&pair_);
|
||||
if (!strcmp(name, "MAP")) evals_.push_back(&map_);
|
||||
if (!strcmp(name, "NDCG")) evals_.push_back(&ndcg_);
|
||||
}
|
||||
|
||||
inline void Init(void) {
|
||||
std::sort(evals_.begin(), evals_.end());
|
||||
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
|
||||
}
|
||||
|
||||
inline void Eval(FILE *fo, const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
float res = evals_[i]->Eval(preds, labels, group_index);
|
||||
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
EvalPair pair_;
|
||||
EvalMAP map_;
|
||||
EvalNDCG ndcg_;
|
||||
std::vector<const IRankEvaluator*> evals_;
|
||||
};
|
||||
|
||||
/*!
 * \brief plain record of four values of independent types
 * \tparam First type of f1_
 * \tparam Second type of f2_
 * \tparam Third type of f3_
 * \tparam Fourth type of f4_
 */
template<typename First, typename Second, typename Third, typename Fourth>
class Quadruple {
public:
    /*! \brief first field */
    First f1_;
    /*! \brief second field */
    Second f2_;
    /*! \brief third field */
    Third f3_;
    /*! \brief fourth field */
    Fourth f4_;
    /*! \brief store a copy of each argument in the matching field */
    Quadruple(First f1, Second f2, Third f3, Fourth f4)
        : f1_(f1),
          f2_(f2),
          f3_(f3),
          f4_(f4) {}
};
|
||||
|
||||
bool Triplef1Comparer(const Triple<float,float,int> &a, const Triple<float,float,int> &b){
|
||||
return a.f1_< b.f1_;
|
||||
}
|
||||
|
||||
/*! \brief Mean Average Precision */
|
||||
class EvalMAP : public IRankEvaluator {
|
||||
public:
|
||||
float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
if (group_index.size() <= 1) return 0;
|
||||
float acc = 0;
|
||||
std::vector<Pair> pairs_sort;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
for (int j = group_index[i]; j < group_index[i + 1]; j++){
|
||||
Pair pair(preds[j], labels[j]);
|
||||
pairs_sort.push_back(pair);
|
||||
}
|
||||
acc += average_precision(pairs_sort);
|
||||
}
|
||||
return acc / (group_index.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
virtual const char *Name(void) const {
|
||||
return "MAP";
|
||||
}
|
||||
private:
|
||||
float average_precision(std::vector<Pair> pairs_sort) const{
|
||||
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
|
||||
float hits = 0;
|
||||
float average_precision = 0;
|
||||
for (int j = 0; j < pairs_sort.size(); j++){
|
||||
if (pairs_sort[j].value_ == 1){
|
||||
hits++;
|
||||
average_precision += hits / (j + 1);
|
||||
}
|
||||
}
|
||||
if (hits != 0) average_precision /= hits;
|
||||
return average_precision;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class EvalPair : public IRankEvaluator{
|
||||
public:
|
||||
float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
if (group_index.size() <= 1) return 0;
|
||||
float acc = 0;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
acc += Count_Inversion(preds,labels,
|
||||
group_index[i],group_index[i+1]);
|
||||
}
|
||||
return acc / (group_index.size() - 1);
|
||||
}
|
||||
|
||||
const char *Name(void) const {
|
||||
return "PAIR";
|
||||
}
|
||||
private:
|
||||
float Count_Inversion(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,int begin,int end
|
||||
) const{
|
||||
float ans = 0;
|
||||
for(int i = begin; i < end; i++){
|
||||
for(int j = i + 1; j < end; j++){
|
||||
if(preds[i] > preds[j] && labels[i] < labels[j])
|
||||
ans++;
|
||||
}
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Normalized DCG */
|
||||
class EvalNDCG : public IRankEvaluator {
|
||||
public:
|
||||
float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
if (group_index.size() <= 1) return 0;
|
||||
float acc = 0;
|
||||
std::vector<Pair> pairs_sort;
|
||||
for (int i = 0; i < group_index.size() - 1; i++){
|
||||
for (int j = group_index[i]; j < group_index[i + 1]; j++){
|
||||
Pair pair(preds[j], labels[j]);
|
||||
pairs_sort.push_back(pair);
|
||||
}
|
||||
acc += NDCG(pairs_sort);
|
||||
}
|
||||
return acc / (group_index.size() - 1);
|
||||
}
|
||||
|
||||
static float DCG(const std::vector<float> &labels){
|
||||
float ans = 0.0;
|
||||
for (int i = 0; i < labels.size(); i++){
|
||||
ans += (pow(2,labels[i]) - 1 ) / log(i + 2);
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
virtual const char *Name(void) const {
|
||||
return "NDCG";
|
||||
}
|
||||
|
||||
private:
|
||||
float NDCG(std::vector<Pair> pairs_sort) const{
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairKeyComparer);
|
||||
float dcg = DCG(pairs_sort);
|
||||
std::sort(pairs_sort.begin(), pairs_sort.end(), PairValueComparer);
|
||||
float IDCG = DCG(pairs_sort);
|
||||
if (IDCG == 0) return 0;
|
||||
return dcg / IDCG;
|
||||
}
|
||||
|
||||
float DCG(std::vector<Pair> pairs_sort) const{
|
||||
std::vector<float> labels;
|
||||
for (int i = 1; i < pairs_sort.size(); i++){
|
||||
labels.push_back(pairs_sort[i].value_);
|
||||
}
|
||||
return DCG(labels);
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
namespace rank {
|
||||
/*! \brief a set of evaluators */
|
||||
class RankEvalSet {
|
||||
public:
|
||||
inline void AddEval(const char *name) {
|
||||
if (!strcmp(name, "PAIR")) evals_.push_back(&pair_);
|
||||
if (!strcmp(name, "MAP")) evals_.push_back(&map_);
|
||||
if (!strcmp(name, "NDCG")) evals_.push_back(&ndcg_);
|
||||
}
|
||||
|
||||
inline void Init(void) {
|
||||
std::sort(evals_.begin(), evals_.end());
|
||||
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
|
||||
}
|
||||
|
||||
inline void Eval(FILE *fo, const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
const std::vector<int> &group_index) const {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
float res = evals_[i]->Eval(preds, labels, group_index);
|
||||
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
EvalPair pair_;
|
||||
EvalMAP map_;
|
||||
EvalNDCG ndcg_;
|
||||
std::vector<const IRankEvaluator*> evals_;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -11,20 +11,12 @@
|
||||
#include "../base/xgboost_boost_task.h"
|
||||
#include "xgboost_rank.h"
|
||||
#include "../regression/xgboost_reg.h"
|
||||
#include "../regression/xgboost_reg_main.cpp"
|
||||
#include "../base/xgboost_data_instance.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
xgboost::random::Seed(0);
|
||||
xgboost::base::BoostTask tsk;
|
||||
xgboost::utils::ConfigIterator itr(argv[1]);
|
||||
/* int learner_index = 0;
|
||||
while (itr.Next()){
|
||||
if (!strcmp(itr.name(), "learning_task")){
|
||||
learner_index = atoi(itr.val());
|
||||
}
|
||||
}*/
|
||||
xgboost::rank::RankBoostLearner* rank_learner = new xgboost::rank::RankBoostLearner;
|
||||
xgboost::base::BoostLearner *parent = static_cast<xgboost::base::BoostLearner*>(rank_learner);
|
||||
tsk.SetLearner(parent);
|
||||
return tsk.Run(argc, argv);
|
||||
int main(int argc, char *argv[]) {
|
||||
xgboost::random::Seed(0);
|
||||
xgboost::base::BoostTask rank_tsk;
|
||||
rank_tsk.SetLearner(new xgboost::rank::RankBoostLearner);
|
||||
return rank_tsk.Run(argc, argv);
|
||||
}
|
||||
|
||||
@ -5,123 +5,124 @@
|
||||
#include"../utils/xgboost_utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace rank {
|
||||
namespace sample {
|
||||
namespace rank {
|
||||
namespace sample {
|
||||
|
||||
/*
|
||||
* \brief the data structure to maintain the sample pairs
|
||||
*/
|
||||
struct Pairs {
|
||||
/*
|
||||
* \brief the data structure to maintain the sample pairs
|
||||
*/
|
||||
struct Pairs {
|
||||
|
||||
/*
|
||||
* \brief constructor given the start and end offset of the sampling group
|
||||
* in overall instances
|
||||
* \param start the begin index of the group
|
||||
* \param end the end index of the group
|
||||
*/
|
||||
Pairs(int start,int end):start_(start),end_(end_){
|
||||
for(int i = start; i < end; i++){
|
||||
std::vector<int> v;
|
||||
pairs_.push_back(v);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* \brief retrieve the related pair information of an data instances
|
||||
* \param index, the index of retrieved instance
|
||||
* \return the index of instances paired
|
||||
*/
|
||||
std::vector<int> GetPairs(int index) {
|
||||
utils::Assert(index >= start_ && index < end_,"The query index out of sampling bound");
|
||||
return pairs_[index-start_];
|
||||
}
|
||||
/*
|
||||
* \brief constructor given the start and end offset of the sampling group
|
||||
* in overall instances
|
||||
* \param start the begin index of the group
|
||||
* \param end the end index of the group
|
||||
*/
|
||||
Pairs(int start, int end) :start_(start), end_(end){
|
||||
for (int i = start; i < end; i++){
|
||||
std::vector<int> v;
|
||||
pairs_.push_back(v);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* \brief retrieve the related pair information of an data instances
|
||||
* \param index, the index of retrieved instance
|
||||
* \return the index of instances paired
|
||||
*/
|
||||
std::vector<int> GetPairs(int index) const{
|
||||
utils::Assert(index >= start_ && index < end_, "The query index out of sampling bound");
|
||||
return pairs_[index - start_];
|
||||
}
|
||||
|
||||
/*
|
||||
* \brief add in a sampled pair
|
||||
* \param index the index of the instance to sample a friend
|
||||
* \param paired_index the index of the instance sampled as a friend
|
||||
*/
|
||||
void push(int index,int paired_index){
|
||||
pairs_[index - start_].push_back(paired_index);
|
||||
}
|
||||
|
||||
std::vector< std::vector<int> > pairs_;
|
||||
int start_;
|
||||
int end_;
|
||||
};
|
||||
/*
|
||||
* \brief add in a sampled pair
|
||||
* \param index the index of the instance to sample a friend
|
||||
* \param paired_index the index of the instance sampled as a friend
|
||||
*/
|
||||
void push(int index, int paired_index){
|
||||
pairs_[index - start_].push_back(paired_index);
|
||||
}
|
||||
|
||||
/*
|
||||
* \brief the interface of pair sampler
|
||||
*/
|
||||
struct IPairSampler {
|
||||
/*
|
||||
* \brief Generate sample pairs given the predcions, labels, the start and the end index
|
||||
* of a specified group
|
||||
* \param preds, the predictions of all data instances
|
||||
* \param labels, the labels of all data instances
|
||||
* \param start, the start index of a specified group
|
||||
* \param end, the end index of a specified group
|
||||
* \return the generated pairs
|
||||
*/
|
||||
virtual Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start,int end) = 0;
|
||||
|
||||
};
|
||||
|
||||
/*! \brief indices of the available pair samplers, see PairSamplerWrapper::AssignSampler */
enum {
    /*! \brief selects BinaryLinearSampler */
    BINARY_LINEAR_SAMPLER = 0
};
|
||||
|
||||
/*! \brief A simple pair sampler when the rank relevence scale is binary
|
||||
* for each positive instance, we will pick a negative
|
||||
* instance and add in a pair. When using binary linear sampler,
|
||||
* we should guarantee the labels are 0 or 1
|
||||
*/
|
||||
struct BinaryLinearSampler:public IPairSampler{
|
||||
virtual Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start,int end) {
|
||||
Pairs pairs(start,end);
|
||||
int pointer = 0, last_pointer = 0,index = start, interval = end - start;
|
||||
for(int i = start; i < end; i++){
|
||||
if(labels[i] == 1){
|
||||
while(true){
|
||||
index = (++pointer) % interval + start;
|
||||
if(labels[index] == 0) break;
|
||||
if(pointer - last_pointer > interval) return pairs;
|
||||
}
|
||||
pairs.push(i,index);
|
||||
pairs.push(index,i);
|
||||
last_pointer = pointer;
|
||||
}
|
||||
}
|
||||
return pairs;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*! \brief Pair Sampler Wrapper*/
|
||||
struct PairSamplerWrapper{
|
||||
public:
|
||||
inline void AssignSampler( int sampler_index ){
|
||||
|
||||
switch(sampler_index){
|
||||
case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler;break;
|
||||
|
||||
default:utils::Error("Cannot find the specified sampler");
|
||||
}
|
||||
}
|
||||
|
||||
Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start,int end){
|
||||
return sampler_->GenPairs(preds,labels,start,end);
|
||||
}
|
||||
private:
|
||||
BinaryLinearSampler binary_linear_sampler;
|
||||
IPairSampler *sampler_;
|
||||
};
|
||||
std::vector< std::vector<int> > pairs_;
|
||||
int start_;
|
||||
int end_;
|
||||
};
|
||||
|
||||
/*
|
||||
* \brief the interface of pair sampler
|
||||
*/
|
||||
struct IPairSampler {
|
||||
/*
|
||||
* \brief Generate sample pairs given the predcions, labels, the start and the end index
|
||||
* of a specified group
|
||||
* \param preds, the predictions of all data instances
|
||||
* \param labels, the labels of all data instances
|
||||
* \param start, the start index of a specified group
|
||||
* \param end, the end index of a specified group
|
||||
* \return the generated pairs
|
||||
*/
|
||||
virtual Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start, int end) = 0;
|
||||
|
||||
};
|
||||
|
||||
/*! \brief sampler indices accepted by PairSamplerWrapper::AssignSampler */
enum {
    /*! \brief index of BinaryLinearSampler */
    BINARY_LINEAR_SAMPLER = 0
};
|
||||
|
||||
/*! \brief A simple pair sampler when the rank relevence scale is binary
|
||||
* for each positive instance, we will pick a negative
|
||||
* instance and add in a pair. When using binary linear sampler,
|
||||
* we should guarantee the labels are 0 or 1
|
||||
*/
|
||||
struct BinaryLinearSampler :public IPairSampler{
|
||||
virtual Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start, int end) {
|
||||
Pairs pairs(start, end);
|
||||
int pointer = 0, last_pointer = 0, index = start, interval = end - start;
|
||||
for (int i = start; i < end; i++){
|
||||
if (labels[i] == 1){
|
||||
while (true){
|
||||
index = (++pointer) % interval + start;
|
||||
if (labels[index] == 0) break;
|
||||
if (pointer - last_pointer > interval) return pairs;
|
||||
}
|
||||
pairs.push(i, index);
|
||||
pairs.push(index, i);
|
||||
last_pointer = pointer;
|
||||
}
|
||||
}
|
||||
return pairs;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*! \brief Pair Sampler Wrapper*/
|
||||
struct PairSamplerWrapper{
|
||||
public:
|
||||
inline void AssignSampler(int sampler_index){
|
||||
|
||||
switch (sampler_index){
|
||||
case BINARY_LINEAR_SAMPLER:sampler_ = &binary_linear_sampler; break;
|
||||
|
||||
default:utils::Error("Cannot find the specified sampler");
|
||||
}
|
||||
}
|
||||
|
||||
Pairs GenPairs(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
int start, int end){
|
||||
utils::Assert(sampler_ != NULL,"Not config the sampler yet. Add rank:sampler in the config file\n");
|
||||
return sampler_->GenPairs(preds, labels, start, end);
|
||||
}
|
||||
private:
|
||||
BinaryLinearSampler binary_linear_sampler;
|
||||
IPairSampler *sampler_;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -21,239 +21,240 @@ namespace xgboost{
|
||||
class RegBoostLearner{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
RegBoostLearner( void ){
|
||||
silent = 0;
|
||||
RegBoostLearner(void){
|
||||
silent = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief a regression booter associated with training and evaluating data
|
||||
/*!
|
||||
* \brief a regression booter associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
RegBoostLearner( const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname ){
|
||||
RegBoostLearner(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname){
|
||||
silent = 0;
|
||||
this->SetData(train,evals,evname);
|
||||
this->SetData(train, evals, evname);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief associate regression booster with training and evaluating data
|
||||
/*!
|
||||
* \brief associate regression booster with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
inline void SetData( const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname ){
|
||||
inline void SetData(const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname){
|
||||
this->train_ = train;
|
||||
this->evals_ = evals;
|
||||
this->evname_ = evname;
|
||||
this->evname_ = evname;
|
||||
// estimate feature bound
|
||||
int num_feature = (int)(train->data.NumCol());
|
||||
// assign buffer index
|
||||
unsigned buffer_size = static_cast<unsigned>( train->Size() );
|
||||
|
||||
for( size_t i = 0; i < evals.size(); ++ i ){
|
||||
buffer_size += static_cast<unsigned>( evals[i]->Size() );
|
||||
num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) );
|
||||
unsigned buffer_size = static_cast<unsigned>(train->Size());
|
||||
|
||||
for (size_t i = 0; i < evals.size(); ++i){
|
||||
buffer_size += static_cast<unsigned>(evals[i]->Size());
|
||||
num_feature = std::max(num_feature, (int)(evals[i]->data.NumCol()));
|
||||
}
|
||||
|
||||
char str_temp[25];
|
||||
if( num_feature > mparam.num_feature ){
|
||||
if (num_feature > mparam.num_feature){
|
||||
mparam.num_feature = num_feature;
|
||||
sprintf( str_temp, "%d", num_feature );
|
||||
base_gbm.SetParam( "bst:num_feature", str_temp );
|
||||
sprintf(str_temp, "%d", num_feature);
|
||||
base_gbm.SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
|
||||
sprintf( str_temp, "%u", buffer_size );
|
||||
base_gbm.SetParam( "num_pbuffer", str_temp );
|
||||
if( !silent ){
|
||||
printf( "buffer_size=%u\n", buffer_size );
|
||||
|
||||
sprintf(str_temp, "%u", buffer_size);
|
||||
base_gbm.SetParam("num_pbuffer", str_temp);
|
||||
if (!silent){
|
||||
printf("buffer_size=%u\n", buffer_size);
|
||||
}
|
||||
|
||||
|
||||
// set eval_preds tmp sapce
|
||||
this->eval_preds_.resize( evals.size(), std::vector<float>() );
|
||||
this->eval_preds_.resize(evals.size(), std::vector<float>());
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp( name, "silent") ) silent = atoi( val );
|
||||
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
|
||||
mparam.SetParam( name, val );
|
||||
base_gbm.SetParam( name, val );
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
mparam.SetParam(name, val);
|
||||
base_gbm.SetParam(name, val);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer( void ){
|
||||
inline void InitTrainer(void){
|
||||
base_gbm.InitTrainer();
|
||||
if( mparam.loss_type == kLogisticClassify ){
|
||||
evaluator_.AddEval( "error" );
|
||||
}else{
|
||||
evaluator_.AddEval( "rmse" );
|
||||
if (mparam.loss_type == kLogisticClassify){
|
||||
evaluator_.AddEval("error");
|
||||
}
|
||||
else{
|
||||
evaluator_.AddEval("rmse");
|
||||
}
|
||||
evaluator_.Init();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel( void ){
|
||||
inline void InitModel(void){
|
||||
base_gbm.InitModel();
|
||||
mparam.AdjustBase();
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
base_gbm.LoadModel( fi );
|
||||
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi){
|
||||
base_gbm.LoadModel(fi);
|
||||
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics as well
|
||||
*/
|
||||
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
|
||||
base_gbm.DumpModel( fo, fmap, with_stats );
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics as well
|
||||
*/
|
||||
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){
|
||||
base_gbm.DumpModel(fo, fmap, with_stats);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath( FILE *fo, const DMatrix &data ){
|
||||
base_gbm.DumpPath( fo, data.data );
|
||||
inline void DumpPath(FILE *fo, const DMatrix &data){
|
||||
base_gbm.DumpPath(fo, data.data);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel( utils::IStream &fo ) const{
|
||||
base_gbm.SaveModel( fo );
|
||||
fo.Write( &mparam, sizeof(ModelParam) );
|
||||
}
|
||||
/*!
|
||||
inline void SaveModel(utils::IStream &fo) const{
|
||||
base_gbm.SaveModel(fo);
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateOneIter( int iter ){
|
||||
this->PredictBuffer( preds_, *train_, 0 );
|
||||
this->GetGradient( preds_, train_->labels, grad_, hess_ );
|
||||
inline void UpdateOneIter(int iter){
|
||||
this->PredictBuffer(preds_, *train_, 0);
|
||||
this->GetGradient(preds_, train_->labels, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief evaluate the model for specific iteration
|
||||
* \param iter iteration number
|
||||
* \param fo file to output log
|
||||
*/
|
||||
inline void EvalOneIter( int iter, FILE *fo = stderr ){
|
||||
fprintf( fo, "[%d]", iter );
|
||||
int buffer_offset = static_cast<int>( train_->Size() );
|
||||
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
std::vector<float> &preds = this->eval_preds_[ i ];
|
||||
this->PredictBuffer( preds, *evals_[i], buffer_offset);
|
||||
evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
*/
|
||||
inline void EvalOneIter(int iter, FILE *fo = stderr){
|
||||
fprintf(fo, "[%d]", iter);
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->PredictBuffer(preds, *evals_[i], buffer_offset);
|
||||
evaluator_.Eval(fo, evname_[i].c_str(), preds, (*evals_[i]).labels);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
fprintf( fo,"\n" );
|
||||
fprintf(fo, "\n");
|
||||
}
|
||||
/*! \brief get prediction, without buffering */
|
||||
inline void Predict( std::vector<float> &preds, const DMatrix &data ){
|
||||
preds.resize( data.Size() );
|
||||
inline void Predict(std::vector<float> &preds, const DMatrix &data){
|
||||
preds.resize(data.Size());
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
|
||||
(mparam.base_score + base_gbm.Predict(data.data, j, -1));
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateInteract( std::string action ){
|
||||
this->InteractPredict( preds_, *train_, 0 );
|
||||
inline void UpdateInteract(std::string action){
|
||||
this->InteractPredict(preds_, *train_, 0);
|
||||
|
||||
int buffer_offset = static_cast<int>( train_->Size() );
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
std::vector<float> &preds = this->eval_preds_[ i ];
|
||||
this->InteractPredict( preds, *evals_[i], buffer_offset );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
int buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
std::vector<float> &preds = this->eval_preds_[i];
|
||||
this->InteractPredict(preds, *evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
|
||||
if( action == "remove" ){
|
||||
if (action == "remove"){
|
||||
base_gbm.DelteBooster(); return;
|
||||
}
|
||||
|
||||
this->GetGradient( preds_, train_->labels, grad_, hess_ );
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
|
||||
|
||||
this->InteractRePredict( *train_, 0 );
|
||||
buffer_offset = static_cast<int>( train_->Size() );
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
this->InteractRePredict( *evals_[i], buffer_offset );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
this->GetGradient(preds_, train_->labels, grad_, hess_);
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost(grad_, hess_, train_->data, root_index);
|
||||
|
||||
this->InteractRePredict(*train_, 0);
|
||||
buffer_offset = static_cast<int>(train_->Size());
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
this->InteractRePredict(*evals_[i], buffer_offset);
|
||||
buffer_offset += static_cast<int>(evals_[i]->Size());
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief get the transformed predictions, given data */
|
||||
inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
|
||||
preds.resize( data.Size() );
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
inline void InteractPredict(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
|
||||
preds.resize(data.Size());
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) );
|
||||
(mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j));
|
||||
}
|
||||
}
|
||||
/*! \brief repredict trial */
|
||||
inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
base_gbm.InteractRePredict( data.data, j, buffer_offset + j );
|
||||
inline void InteractRePredict(const DMatrix &data, unsigned buffer_offset){
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j){
|
||||
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief get the transformed predictions, given data */
|
||||
inline void PredictBuffer( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
|
||||
preds.resize( data.Size() );
|
||||
inline void PredictBuffer(std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset){
|
||||
preds.resize(data.Size());
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
const unsigned ndata = static_cast<unsigned>(data.Size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
|
||||
(mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j));
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
||||
inline void GetGradient( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess ){
|
||||
grad.resize( preds.size() ); hess.resize( preds.size() );
|
||||
inline void GetGradient(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess){
|
||||
grad.resize(preds.size()); hess.resize(preds.size());
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] );
|
||||
hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] );
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
#pragma omp parallel for schedule( static )
|
||||
for (unsigned j = 0; j < ndata; ++j){
|
||||
grad[j] = mparam.FirstOrderGradient(preds[j], labels[j]);
|
||||
hess[j] = mparam.SecondOrderGradient(preds[j], labels[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
enum LossType{
|
||||
kLinearSquare = 0,
|
||||
@ -270,73 +271,73 @@ namespace xgboost{
|
||||
/* \brief number of features */
|
||||
int num_feature;
|
||||
/*! \brief reserved field */
|
||||
int reserved[ 16 ];
|
||||
int reserved[16];
|
||||
/*! \brief constructor */
|
||||
ModelParam( void ){
|
||||
ModelParam(void){
|
||||
base_score = 0.5f;
|
||||
loss_type = 0;
|
||||
loss_type = 0;
|
||||
num_feature = 0;
|
||||
memset( reserved, 0, sizeof( reserved ) );
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
|
||||
if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
|
||||
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("base_score", name)) base_score = (float)atof(val);
|
||||
if (!strcmp("loss_type", name)) loss_type = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief adjust base_score
|
||||
*/
|
||||
inline void AdjustBase( void ){
|
||||
if( loss_type == 1 || loss_type == 2 ){
|
||||
utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
|
||||
base_score = - logf( 1.0f / base_score - 1.0f );
|
||||
*/
|
||||
inline void AdjustBase(void){
|
||||
if (loss_type == 1 || loss_type == 2){
|
||||
utils::Assert(base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain");
|
||||
base_score = -logf(1.0f / base_score - 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief transform the linear sum to prediction
|
||||
/*!
|
||||
* \brief transform the linear sum to prediction
|
||||
* \param x linear sum of boosting ensemble
|
||||
* \return transformed prediction
|
||||
*/
|
||||
inline float PredTransform( float x ){
|
||||
switch( loss_type ){
|
||||
inline float PredTransform(float x){
|
||||
switch (loss_type){
|
||||
case kLinearSquare: return x;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return 1.0f/(1.0f + expf(-x));
|
||||
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
/*!
|
||||
* \brief calculate first order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return first order gradient
|
||||
*/
|
||||
inline float FirstOrderGradient( float predt, float label ) const{
|
||||
switch( loss_type ){
|
||||
inline float FirstOrderGradient(float predt, float label) const{
|
||||
switch (loss_type){
|
||||
case kLinearSquare: return predt - label;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt - label;
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief calculate second order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return second order gradient
|
||||
*/
|
||||
inline float SecondOrderGradient( float predt, float label ) const{
|
||||
switch( loss_type ){
|
||||
inline float SecondOrderGradient(float predt, float label) const{
|
||||
switch (loss_type){
|
||||
case kLinearSquare: return 1.0f;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt * ( 1 - predt );
|
||||
case kLogisticNeglik: return predt * (1 - predt);
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
@ -348,10 +349,10 @@ namespace xgboost{
|
||||
* \return the specified loss
|
||||
*/
|
||||
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
||||
switch( loss_type ){
|
||||
case kLinearSquare: return SquareLoss(preds,labels);
|
||||
case kLogisticNeglik:
|
||||
case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels);
|
||||
switch (loss_type){
|
||||
case kLinearSquare: return SquareLoss(preds, labels);
|
||||
case kLogisticNeglik:
|
||||
case kLogisticClassify: return NegLoglikelihoodLoss(preds, labels);
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
@ -364,7 +365,7 @@ namespace xgboost{
|
||||
*/
|
||||
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
||||
float ans = 0.0;
|
||||
for(size_t i = 0; i < preds.size(); i++){
|
||||
for (size_t i = 0; i < preds.size(); i++){
|
||||
float dif = preds[i] - labels[i];
|
||||
ans += dif * dif;
|
||||
}
|
||||
@ -379,8 +380,8 @@ namespace xgboost{
|
||||
*/
|
||||
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
    // Sums -(y*log(p) + (1-y)*log(1-p)) over all predictions; labels is indexed by prediction index.
    // A prediction of exactly 0 or 1 would make logf return -inf, and 0 * -inf is NaN, so each
    // probability is kept strictly inside (0, 1) before taking logarithms.
    const float eps = 1e-7f;
    float ans = 0.0f;
    for (size_t i = 0; i < preds.size(); i++){
        float p = preds[i];
        if (p < eps) p = eps;
        if (p > 1.0f - eps) p = 1.0f - eps;
        ans -= labels[i] * logf(p) + (1.0f - labels[i]) * logf(1.0f - p);
    }
    return ans;
}
|
||||
};
|
||||
|
||||
@ -27,111 +27,112 @@ namespace xgboost{
|
||||
std::vector<float> labels;
|
||||
public:
|
||||
/*! \brief default constructor */
|
||||
DMatrix( void ){}
|
||||
DMatrix(void){}
|
||||
|
||||
/*! \brief get the number of instances */
|
||||
inline size_t Size() const{
|
||||
return labels.size();
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \param fname name of text data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void LoadText( const char* fname, bool silent = false ){
|
||||
*/
|
||||
inline void LoadText(const char* fname, bool silent = false){
|
||||
data.Clear();
|
||||
FILE* file = utils::FopenCheck( fname, "r" );
|
||||
FILE* file = utils::FopenCheck(fname, "r");
|
||||
float label; bool init = true;
|
||||
char tmp[ 1024 ];
|
||||
char tmp[1024];
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
|
||||
while( fscanf( file, "%s", tmp ) == 1 ){
|
||||
while (fscanf(file, "%s", tmp) == 1){
|
||||
unsigned index; float value;
|
||||
if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
|
||||
findex.push_back( index ); fvalue.push_back( value );
|
||||
}else{
|
||||
if( !init ){
|
||||
labels.push_back( label );
|
||||
data.AddRow( findex, fvalue );
|
||||
if (sscanf(tmp, "%u:%f", &index, &value) == 2){
|
||||
findex.push_back(index); fvalue.push_back(value);
|
||||
}
|
||||
else{
|
||||
if (!init){
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
}
|
||||
findex.clear(); fvalue.clear();
|
||||
utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" );
|
||||
utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
|
||||
init = false;
|
||||
}
|
||||
}
|
||||
|
||||
labels.push_back( label );
|
||||
data.AddRow( findex, fvalue );
|
||||
labels.push_back(label);
|
||||
data.AddRow(findex, fvalue);
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \return whether loading is success
|
||||
*/
|
||||
inline bool LoadBinary( const char* fname, bool silent = false ){
|
||||
FILE *fp = fopen64( fname, "rb" );
|
||||
if( fp == NULL ) return false;
|
||||
utils::FileStream fs( fp );
|
||||
data.LoadBinary( fs );
|
||||
labels.resize( data.NumRow() );
|
||||
utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
|
||||
inline bool LoadBinary(const char* fname, bool silent = false){
|
||||
FILE *fp = fopen64(fname, "rb");
|
||||
if (fp == NULL) return false;
|
||||
utils::FileStream fs(fp);
|
||||
data.LoadBinary(fs);
|
||||
labels.resize(data.NumRow());
|
||||
utils::Assert(fs.Read(&labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
|
||||
fs.Close();
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief save to binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void SaveBinary( const char* fname, bool silent = false ){
|
||||
inline void SaveBinary(const char* fname, bool silent = false){
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
|
||||
data.SaveBinary( fs );
|
||||
fs.Write( &labels[0], sizeof(float) * data.NumRow() );
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||
data.SaveBinary(fs);
|
||||
fs.Write(&labels[0], sizeof(float)* data.NumRow());
|
||||
fs.Close();
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
if (!silent){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||
* otherwise the function will first check if fname + '.buffer' exists,
|
||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||
* and try to create a buffer file
|
||||
* and try to create a buffer file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \param savebuffer whether do save binary buffer if it is text
|
||||
*/
|
||||
inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){
|
||||
int len = strlen( fname );
|
||||
if( len > 8 && !strcmp( fname + len - 7, ".buffer") ){
|
||||
this->LoadBinary( fname, silent ); return;
|
||||
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
|
||||
int len = strlen(fname);
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
|
||||
this->LoadBinary(fname, silent); return;
|
||||
}
|
||||
char bname[ 1024 ];
|
||||
sprintf( bname, "%s.buffer", fname );
|
||||
if( !this->LoadBinary( bname, silent ) ){
|
||||
this->LoadText( fname, silent );
|
||||
if( savebuffer ) this->SaveBinary( bname, silent );
|
||||
char bname[1024];
|
||||
sprintf(bname, "%s.buffer", fname);
|
||||
if (!this->LoadBinary(bname, silent)){
|
||||
this->LoadText(fname, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, silent);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -16,72 +16,73 @@ namespace xgboost{
|
||||
namespace regression{
|
||||
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
    /*!
     * \brief evaluate a specific metric
     * \param preds prediction
     * \param labels label
     * \return value of the metric
     */
    virtual float Eval(const std::vector<float> &preds,
                       const std::vector<float> &labels) const = 0;
    /*! \return name of metric */
    virtual const char *Name(void) const = 0;
    /*! \brief virtual destructor, so that an evaluator can be destroyed through a pointer to this interface */
    virtual ~IEvaluator(void){}
};
|
||||
|
||||
/*! \brief RMSE */
|
||||
struct EvalRMSE : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
struct EvalRMSE : public IEvaluator{
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels) const{
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
float sum = 0.0;
|
||||
#pragma omp parallel for reduction(+:sum) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
#pragma omp parallel for reduction(+:sum) schedule( static )
|
||||
for (unsigned i = 0; i < ndata; ++i){
|
||||
float diff = preds[i] - labels[i];
|
||||
sum += diff * diff;
|
||||
}
|
||||
return sqrtf( sum / ndata );
|
||||
}
|
||||
return sqrtf(sum / ndata);
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
virtual const char *Name(void) const{
|
||||
return "rmse";
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Error */
|
||||
struct EvalError : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
struct EvalError : public IEvaluator{
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels) const{
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
unsigned nerr = 0;
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
if( preds[i] > 0.5f ){
|
||||
if( labels[i] < 0.5f ) nerr += 1;
|
||||
}else{
|
||||
if( labels[i] > 0.5f ) nerr += 1;
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for (unsigned i = 0; i < ndata; ++i){
|
||||
if (preds[i] > 0.5f){
|
||||
if (labels[i] < 0.5f) nerr += 1;
|
||||
}
|
||||
}
|
||||
else{
|
||||
if (labels[i] > 0.5f) nerr += 1;
|
||||
}
|
||||
}
|
||||
return static_cast<float>(nerr) / ndata;
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
virtual const char *Name(void) const{
|
||||
return "error";
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*! \brief Error */
|
||||
struct EvalLogLoss : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
struct EvalLogLoss : public IEvaluator{
|
||||
virtual float Eval(const std::vector<float> &preds,
|
||||
const std::vector<float> &labels) const{
|
||||
const unsigned ndata = static_cast<unsigned>(preds.size());
|
||||
unsigned nerr = 0;
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for (unsigned i = 0; i < ndata; ++i){
|
||||
const float y = labels[i];
|
||||
const float py = preds[i];
|
||||
nerr -= y * std::log(py) + (1.0f-y)*std::log(1-py);
|
||||
}
|
||||
nerr -= y * std::log(py) + (1.0f - y)*std::log(1 - py);
|
||||
}
|
||||
return static_cast<float>(nerr) / ndata;
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
virtual const char *Name(void) const{
|
||||
return "negllik";
|
||||
}
|
||||
};
|
||||
@ -91,28 +92,28 @@ namespace xgboost{
|
||||
/*! \brief a set of evaluators */
|
||||
struct EvalSet{
|
||||
public:
|
||||
inline void AddEval( const char *name ){
|
||||
if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
|
||||
if( !strcmp( name, "error") ) evals_.push_back( &error_ );
|
||||
if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
|
||||
inline void AddEval(const char *name){
|
||||
if (!strcmp(name, "rmse")) evals_.push_back(&rmse_);
|
||||
if (!strcmp(name, "error")) evals_.push_back(&error_);
|
||||
if (!strcmp(name, "logloss")) evals_.push_back(&logloss_);
|
||||
}
|
||||
inline void Init( void ){
|
||||
std::sort( evals_.begin(), evals_.end() );
|
||||
evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() );
|
||||
inline void Init(void){
|
||||
std::sort(evals_.begin(), evals_.end());
|
||||
evals_.resize(std::unique(evals_.begin(), evals_.end()) - evals_.begin());
|
||||
}
|
||||
inline void Eval( FILE *fo, const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
for( size_t i = 0; i < evals_.size(); ++ i ){
|
||||
float res = evals_[i]->Eval( preds, labels );
|
||||
fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
|
||||
}
|
||||
inline void Eval(FILE *fo, const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const std::vector<float> &labels) const{
|
||||
for (size_t i = 0; i < evals_.size(); ++i){
|
||||
float res = evals_[i]->Eval(preds, labels);
|
||||
fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||
}
|
||||
}
|
||||
private:
|
||||
EvalRMSE rmse_;
|
||||
EvalError error_;
|
||||
EvalLogLoss logloss_;
|
||||
std::vector<const IEvaluator*> evals_;
|
||||
std::vector<const IEvaluator*> evals_;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
@ -16,83 +16,84 @@ namespace xgboost{
|
||||
* given the configuration
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
|
||||
*/
|
||||
class RegBoostTask{
|
||||
class RegBoostTask{
|
||||
public:
|
||||
inline int Run( int argc, char *argv[] ){
|
||||
if( argc < 2 ){
|
||||
printf("Usage: <config>\n");
|
||||
inline int Run(int argc, char *argv[]){
|
||||
if (argc < 2){
|
||||
printf("Usage: <config>\n");
|
||||
return 0;
|
||||
}
|
||||
utils::ConfigIterator itr( argv[1] );
|
||||
while( itr.Next() ){
|
||||
this->SetParam( itr.name(), itr.val() );
|
||||
utils::ConfigIterator itr(argv[1]);
|
||||
while (itr.Next()){
|
||||
this->SetParam(itr.name(), itr.val());
|
||||
}
|
||||
for( int i = 2; i < argc; i ++ ){
|
||||
for (int i = 2; i < argc; i++){
|
||||
char name[256], val[256];
|
||||
if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){
|
||||
this->SetParam( name, val );
|
||||
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){
|
||||
this->SetParam(name, val);
|
||||
}
|
||||
}
|
||||
this->InitData();
|
||||
this->InitLearner();
|
||||
if( task == "dump" ){
|
||||
if (task == "dump"){
|
||||
this->TaskDump();
|
||||
return 0;
|
||||
}
|
||||
if( task == "interact" ){
|
||||
if (task == "interact"){
|
||||
this->TaskInteractive(); return 0;
|
||||
}
|
||||
if( task == "dumppath" ){
|
||||
if (task == "dumppath"){
|
||||
this->TaskDumpPath(); return 0;
|
||||
}
|
||||
if( task == "eval" ){
|
||||
if (task == "eval"){
|
||||
this->TaskEval(); return 0;
|
||||
}
|
||||
if( task == "pred" ){
|
||||
if (task == "pred"){
|
||||
this->TaskPred();
|
||||
}else{
|
||||
}
|
||||
else{
|
||||
this->TaskTrain();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("silent", name ) ) silent = atoi( val );
|
||||
if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val );
|
||||
if( !strcmp("seed", name ) ) random::Seed( atoi(val) );
|
||||
if( !strcmp("num_round", name ) ) num_round = atoi( val );
|
||||
if( !strcmp("save_period", name ) ) save_period = atoi( val );
|
||||
if( !strcmp("task", name ) ) task = val;
|
||||
if( !strcmp("data", name ) ) train_path = val;
|
||||
if( !strcmp("test:data", name ) ) test_path = val;
|
||||
if( !strcmp("model_in", name ) ) model_in = val;
|
||||
if( !strcmp("model_out", name ) ) model_out = val;
|
||||
if( !strcmp("model_dir", name ) ) model_dir_path = val;
|
||||
if( !strcmp("fmap", name ) ) name_fmap = val;
|
||||
if( !strcmp("name_dump", name ) ) name_dump = val;
|
||||
if( !strcmp("name_dumppath", name ) ) name_dumppath = val;
|
||||
if( !strcmp("name_pred", name ) ) name_pred = val;
|
||||
if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val );
|
||||
if( !strcmp("interact:action", name ) ) interact_action = val;
|
||||
if( !strncmp("batch:", name, 6 ) ){
|
||||
cfg_batch.PushBack( name + 6, val );
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp("silent", name)) silent = atoi(val);
|
||||
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
|
||||
if (!strcmp("seed", name)) random::Seed(atoi(val));
|
||||
if (!strcmp("num_round", name)) num_round = atoi(val);
|
||||
if (!strcmp("save_period", name)) save_period = atoi(val);
|
||||
if (!strcmp("task", name)) task = val;
|
||||
if (!strcmp("data", name)) train_path = val;
|
||||
if (!strcmp("test:data", name)) test_path = val;
|
||||
if (!strcmp("model_in", name)) model_in = val;
|
||||
if (!strcmp("model_out", name)) model_out = val;
|
||||
if (!strcmp("model_dir", name)) model_dir_path = val;
|
||||
if (!strcmp("fmap", name)) name_fmap = val;
|
||||
if (!strcmp("name_dump", name)) name_dump = val;
|
||||
if (!strcmp("name_dumppath", name)) name_dumppath = val;
|
||||
if (!strcmp("name_pred", name)) name_pred = val;
|
||||
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
|
||||
if (!strcmp("interact:action", name)) interact_action = val;
|
||||
if (!strncmp("batch:", name, 6)){
|
||||
cfg_batch.PushBack(name + 6, val);
|
||||
}
|
||||
if( !strncmp("eval[", name, 5 ) ) {
|
||||
char evname[ 256 ];
|
||||
utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
|
||||
eval_data_names.push_back( std::string( evname ) );
|
||||
eval_data_paths.push_back( std::string( val ) );
|
||||
if (!strncmp("eval[", name, 5)) {
|
||||
char evname[256];
|
||||
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
|
||||
eval_data_names.push_back(std::string(evname));
|
||||
eval_data_paths.push_back(std::string(val));
|
||||
}
|
||||
cfg.PushBack( name, val );
|
||||
cfg.PushBack(name, val);
|
||||
}
|
||||
public:
|
||||
RegBoostTask( void ){
|
||||
RegBoostTask(void){
|
||||
// default parameters
|
||||
silent = 0;
|
||||
use_buffer = 1;
|
||||
num_round = 10;
|
||||
save_period = 0;
|
||||
dump_model_stats = 0;
|
||||
task = "train";
|
||||
task = "train";
|
||||
model_in = "NULL";
|
||||
model_out = "NULL";
|
||||
name_fmap = "NULL";
|
||||
@ -102,128 +103,132 @@ namespace xgboost{
|
||||
model_dir_path = "./";
|
||||
interact_action = "update";
|
||||
}
|
||||
/*! \brief destructor: deletes every evaluation matrix held in deval */
~RegBoostTask(void){
    const size_t num_eval = deval.size();
    for (size_t k = 0; k < num_eval; ++k){
        delete deval[k];
    }
}
|
||||
private:
|
||||
inline void InitData( void ){
|
||||
if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
|
||||
if( task == "dump" ) return;
|
||||
if( task == "pred" || task == "dumppath" ){
|
||||
data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
|
||||
}else{
|
||||
inline void InitData(void){
|
||||
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
|
||||
if (task == "dump") return;
|
||||
if (task == "pred" || task == "dumppath"){
|
||||
data.CacheLoad(test_path.c_str(), silent != 0, use_buffer != 0);
|
||||
}
|
||||
else{
|
||||
// training
|
||||
data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 );
|
||||
utils::Assert( eval_data_names.size() == eval_data_paths.size() );
|
||||
for( size_t i = 0; i < eval_data_names.size(); ++ i ){
|
||||
deval.push_back( new DMatrix() );
|
||||
deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 );
|
||||
data.CacheLoad(train_path.c_str(), silent != 0, use_buffer != 0);
|
||||
utils::Assert(eval_data_names.size() == eval_data_paths.size());
|
||||
for (size_t i = 0; i < eval_data_names.size(); ++i){
|
||||
deval.push_back(new DMatrix());
|
||||
deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0);
|
||||
}
|
||||
}
|
||||
learner.SetData( &data, deval, eval_data_names );
|
||||
learner.SetData(&data, deval, eval_data_names);
|
||||
}
|
||||
inline void InitLearner( void ){
|
||||
inline void InitLearner(void){
|
||||
cfg.BeforeFirst();
|
||||
while( cfg.Next() ){
|
||||
learner.SetParam( cfg.name(), cfg.val() );
|
||||
while (cfg.Next()){
|
||||
learner.SetParam(cfg.name(), cfg.val());
|
||||
}
|
||||
if( model_in != "NULL" ){
|
||||
utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") );
|
||||
learner.LoadModel( fi );
|
||||
if (model_in != "NULL"){
|
||||
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
|
||||
learner.LoadModel(fi);
|
||||
fi.Close();
|
||||
}else{
|
||||
utils::Assert( task == "train", "model_in not specified" );
|
||||
}
|
||||
else{
|
||||
utils::Assert(task == "train", "model_in not specified");
|
||||
learner.InitModel();
|
||||
}
|
||||
learner.InitTrainer();
|
||||
}
|
||||
inline void TaskTrain( void ){
|
||||
const time_t start = time( NULL );
|
||||
inline void TaskTrain(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
for( int i = 0; i < num_round; ++ i ){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed );
|
||||
learner.UpdateOneIter( i );
|
||||
learner.EvalOneIter( i );
|
||||
if( save_period != 0 && (i+1) % save_period == 0 ){
|
||||
this->SaveModel( i );
|
||||
for (int i = 0; i < num_round; ++i){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||
learner.UpdateOneIter(i);
|
||||
learner.EvalOneIter(i);
|
||||
if (save_period != 0 && (i + 1) % save_period == 0){
|
||||
this->SaveModel(i);
|
||||
}
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
}
|
||||
// always save final round
|
||||
if( save_period == 0 || num_round % save_period != 0 ){
|
||||
if( model_out == "NULL" ){
|
||||
this->SaveModel( num_round - 1 );
|
||||
}else{
|
||||
this->SaveModel( model_out.c_str() );
|
||||
if (save_period == 0 || num_round % save_period != 0){
|
||||
if (model_out == "NULL"){
|
||||
this->SaveModel(num_round - 1);
|
||||
}
|
||||
else{
|
||||
this->SaveModel(model_out.c_str());
|
||||
}
|
||||
}
|
||||
if( !silent ){
|
||||
printf("\nupdating end, %lu sec in all\n", elapsed );
|
||||
if (!silent){
|
||||
printf("\nupdating end, %lu sec in all\n", elapsed);
|
||||
}
|
||||
}
|
||||
/*! \brief evaluation task: asks the learner for a single evaluation, passing 0 as the iteration number */
inline void TaskEval(void){
    const int iteration = 0;
    learner.EvalOneIter(iteration);
}
|
||||
inline void TaskInteractive( void ){
|
||||
const time_t start = time( NULL );
|
||||
inline void TaskInteractive(void){
|
||||
const time_t start = time(NULL);
|
||||
unsigned long elapsed = 0;
|
||||
int batch_action = 0;
|
||||
|
||||
|
||||
cfg_batch.BeforeFirst();
|
||||
while( cfg_batch.Next() ){
|
||||
if( !strcmp( cfg_batch.name(), "run" ) ){
|
||||
learner.UpdateInteract( interact_action );
|
||||
while (cfg_batch.Next()){
|
||||
if (!strcmp(cfg_batch.name(), "run")){
|
||||
learner.UpdateInteract(interact_action);
|
||||
batch_action += 1;
|
||||
} else{
|
||||
learner.SetParam( cfg_batch.name(), cfg_batch.val() );
|
||||
}
|
||||
else{
|
||||
learner.SetParam(cfg_batch.name(), cfg_batch.val());
|
||||
}
|
||||
}
|
||||
|
||||
if( batch_action == 0 ){
|
||||
learner.UpdateInteract( interact_action );
|
||||
if (batch_action == 0){
|
||||
learner.UpdateInteract(interact_action);
|
||||
}
|
||||
utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
|
||||
this->SaveModel( model_out.c_str() );
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
|
||||
this->SaveModel(model_out.c_str());
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
|
||||
if( !silent ){
|
||||
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed );
|
||||
if (!silent){
|
||||
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed);
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief dump task: lets the learner write the model to the text file name_dump, using the feature map */
inline void TaskDump(void){
    const bool with_stats = (dump_model_stats != 0);
    FILE *out = utils::FopenCheck(name_dump.c_str(), "w");
    learner.DumpModel(out, fmap, with_stats);
    fclose(out);
}
|
||||
/*! \brief dumppath task: lets the learner write its path dump for the loaded data to the text file name_dumppath */
inline void TaskDumpPath(void){
    FILE *out = utils::FopenCheck(name_dumppath.c_str(), "w");
    learner.DumpPath(out, data);
    fclose(out);
}
|
||||
inline void SaveModel( const char *fname ) const{
|
||||
utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
|
||||
learner.SaveModel( fo );
|
||||
inline void SaveModel(const char *fname) const{
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
learner.SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
inline void SaveModel( int i ) const{
|
||||
inline void SaveModel(int i) const{
|
||||
char fname[256];
|
||||
sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
|
||||
this->SaveModel( fname );
|
||||
sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1);
|
||||
this->SaveModel(fname);
|
||||
}
|
||||
inline void TaskPred( void ){
|
||||
inline void TaskPred(void){
|
||||
std::vector<float> preds;
|
||||
if( !silent ) printf("start prediction...\n");
|
||||
learner.Predict( preds, data );
|
||||
if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() );
|
||||
FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" );
|
||||
for( size_t i = 0; i < preds.size(); i ++ ){
|
||||
fprintf( fo, "%f\n", preds[i] );
|
||||
if (!silent) printf("start prediction...\n");
|
||||
learner.Predict(preds, data);
|
||||
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
|
||||
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
|
||||
for (size_t i = 0; i < preds.size(); i++){
|
||||
fprintf(fo, "%f\n", preds[i]);
|
||||
}
|
||||
fclose( fo );
|
||||
fclose(fo);
|
||||
}
|
||||
private:
|
||||
/* \brief whether silent */
|
||||
@ -231,7 +236,7 @@ namespace xgboost{
|
||||
/* \brief whether use auto binary buffer */
|
||||
int use_buffer;
|
||||
/* \brief number of boosting iterations */
|
||||
int num_round;
|
||||
int num_round;
|
||||
/* \brief the period to save the model, 0 means only save the final round model */
|
||||
int save_period;
|
||||
/*! \brief interfact action */
|
||||
@ -257,9 +262,9 @@ namespace xgboost{
|
||||
/* \brief name of dump path file */
|
||||
std::string name_dumppath;
|
||||
/* \brief the paths of validation data sets */
|
||||
std::vector<std::string> eval_data_paths;
|
||||
std::vector<std::string> eval_data_paths;
|
||||
/* \brief the names of the evaluation data used in output log */
|
||||
std::vector<std::string> eval_data_names;
|
||||
std::vector<std::string> eval_data_names;
|
||||
/*! \brief saves configurations */
|
||||
utils::ConfigSaver cfg;
|
||||
/*! \brief batch configurations */
|
||||
@ -274,7 +279,7 @@ namespace xgboost{
|
||||
};
|
||||
|
||||
int main( int argc, char *argv[] ){
|
||||
xgboost::random::Seed( 0 );
|
||||
xgboost::regression::RegBoostTask tsk;
|
||||
return tsk.Run( argc, argv );
|
||||
xgboost::random::Seed( 0 );
|
||||
xgboost::regression::RegBoostTask tsk;
|
||||
return tsk.Run( argc, argv );
|
||||
}
|
||||
|
||||
@ -14,198 +14,203 @@
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
/*!
|
||||
* \brief an iterator that iterates over a configure file and gets the configures
|
||||
*/
|
||||
class ConfigIterator{
|
||||
public:
|
||||
/*!
|
||||
* \brief constructor
|
||||
/*!
|
||||
* \brief constructor
|
||||
* \param fname name of configure file
|
||||
*/
|
||||
ConfigIterator( const char *fname ){
|
||||
fi = FopenCheck( fname, "r");
|
||||
ch_buf = fgetc( fi );
|
||||
ConfigIterator(const char *fname){
|
||||
fi = FopenCheck(fname, "r");
|
||||
ch_buf = fgetc(fi);
|
||||
}
|
||||
/*! \brief destructor */
|
||||
~ConfigIterator(){
|
||||
fclose( fi );
|
||||
fclose(fi);
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief get current name, called after Next returns true
|
||||
* \return current parameter name
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name( void )const{
|
||||
inline const char *name(void)const{
|
||||
return s_name;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief get current value, called after Next returns true
|
||||
* \return current parameter value
|
||||
* \return current parameter value
|
||||
*/
|
||||
inline const char *val( void ) const{
|
||||
inline const char *val(void) const{
|
||||
return s_val;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief move iterator to next position
|
||||
* \return true if there is value in next position
|
||||
*/
|
||||
inline bool Next( void ){
|
||||
while( !feof( fi ) ){
|
||||
GetNextToken( s_name );
|
||||
if( s_name[0] == '=') return false;
|
||||
if( GetNextToken( s_buf ) || s_buf[0] != '=' ) return false;
|
||||
if( GetNextToken( s_val ) || s_val[0] == '=' ) return false;
|
||||
inline bool Next(void){
|
||||
while (!feof(fi)){
|
||||
GetNextToken(s_name);
|
||||
if (s_name[0] == '=') return false;
|
||||
if (GetNextToken(s_buf) || s_buf[0] != '=') return false;
|
||||
if (GetNextToken(s_val) || s_val[0] == '=') return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
private:
|
||||
FILE *fi;
|
||||
FILE *fi;
|
||||
char ch_buf;
|
||||
char s_name[256],s_val[256],s_buf[246];
|
||||
|
||||
inline void SkipLine(){
|
||||
char s_name[256], s_val[256], s_buf[246];
|
||||
|
||||
inline void SkipLine(){
|
||||
do{
|
||||
ch_buf = fgetc( fi );
|
||||
}while( ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r' );
|
||||
ch_buf = fgetc(fi);
|
||||
} while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
|
||||
}
|
||||
|
||||
inline void ParseStr( char tok[] ){
|
||||
int i = 0;
|
||||
while( (ch_buf = fgetc(fi)) != EOF ){
|
||||
switch( ch_buf ){
|
||||
case '\\': tok[i++] = fgetc( fi ); break;
|
||||
case '\"': tok[i++] = '\0';
|
||||
return;
|
||||
|
||||
inline void ParseStr(char tok[]){
|
||||
int i = 0;
|
||||
while ((ch_buf = fgetc(fi)) != EOF){
|
||||
switch (ch_buf){
|
||||
case '\\': tok[i++] = fgetc(fi); break;
|
||||
case '\"': tok[i++] = '\0';
|
||||
return;
|
||||
case '\r':
|
||||
case '\n': Error("unterminated string"); break;
|
||||
default: tok[i++] = ch_buf;
|
||||
}
|
||||
}
|
||||
Error("unterminated string");
|
||||
Error("unterminated string");
|
||||
}
|
||||
// return newline
|
||||
inline bool GetNextToken( char tok[] ){
|
||||
inline bool GetNextToken(char tok[]){
|
||||
int i = 0;
|
||||
bool new_line = false;
|
||||
while( ch_buf != EOF ){
|
||||
switch( ch_buf ){
|
||||
case '#' : SkipLine(); new_line = true; break;
|
||||
bool new_line = false;
|
||||
while (ch_buf != EOF){
|
||||
switch (ch_buf){
|
||||
case '#': SkipLine(); new_line = true; break;
|
||||
case '\"':
|
||||
if( i == 0 ){
|
||||
ParseStr( tok );ch_buf = fgetc(fi); return new_line;
|
||||
}else{
|
||||
Error("token followed directly by string");
|
||||
if (i == 0){
|
||||
ParseStr(tok); ch_buf = fgetc(fi); return new_line;
|
||||
}
|
||||
else{
|
||||
Error("token followed directly by string");
|
||||
}
|
||||
case '=':
|
||||
if( i == 0 ) {
|
||||
ch_buf = fgetc( fi );
|
||||
tok[0] = '=';
|
||||
tok[1] = '\0';
|
||||
}else{
|
||||
tok[i] = '\0';
|
||||
if (i == 0) {
|
||||
ch_buf = fgetc(fi);
|
||||
tok[0] = '=';
|
||||
tok[1] = '\0';
|
||||
}
|
||||
else{
|
||||
tok[i] = '\0';
|
||||
}
|
||||
return new_line;
|
||||
case '\r':
|
||||
case '\r':
|
||||
case '\n':
|
||||
if( i == 0 ) new_line = true;
|
||||
if (i == 0) new_line = true;
|
||||
case '\t':
|
||||
case ' ' :
|
||||
ch_buf = fgetc( fi );
|
||||
if( i > 0 ){
|
||||
tok[i] = '\0';
|
||||
case ' ':
|
||||
ch_buf = fgetc(fi);
|
||||
if (i > 0){
|
||||
tok[i] = '\0';
|
||||
return new_line;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
default:
|
||||
tok[i++] = ch_buf;
|
||||
ch_buf = fgetc( fi );
|
||||
break;
|
||||
ch_buf = fgetc(fi);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief a class that save parameter configurations
|
||||
* temporally and allows to get them out later
|
||||
/*!
|
||||
* \brief a class that save parameter configurations
|
||||
* temporally and allows to get them out later
|
||||
* there are two kinds of priority in ConfigSaver
|
||||
*/
|
||||
class ConfigSaver{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
ConfigSaver( void ){ idx = 0; }
|
||||
ConfigSaver(void){ idx = 0; }
|
||||
/*! \brief clear all saves */
|
||||
inline void Clear( void ){
|
||||
inline void Clear(void){
|
||||
idx = 0;
|
||||
names.clear(); values.clear();
|
||||
names_high.clear(); values_high.clear();
|
||||
}
|
||||
/*!
|
||||
* \brief push back a parameter setting
|
||||
/*!
|
||||
* \brief push back a parameter setting
|
||||
* \param name name of parameter
|
||||
* \param val value of parameter
|
||||
* \param priority whether the setting has higher priority: high priority occurs
|
||||
* \param priority whether the setting has higher priority: high priority occurs
|
||||
* latter when read from ConfigSaver, and can overwrite existing settings
|
||||
*/
|
||||
inline void PushBack( const char *name, const char *val, int priority = 0 ){
|
||||
if( priority == 0 ){
|
||||
names.push_back( std::string( name ) );
|
||||
values.push_back( std::string( val ) );
|
||||
}else{
|
||||
names_high.push_back( std::string( name ) );
|
||||
values_high.push_back( std::string( val ) );
|
||||
inline void PushBack(const char *name, const char *val, int priority = 0){
|
||||
if (priority == 0){
|
||||
names.push_back(std::string(name));
|
||||
values.push_back(std::string(val));
|
||||
}
|
||||
else{
|
||||
names_high.push_back(std::string(name));
|
||||
values_high.push_back(std::string(val));
|
||||
}
|
||||
}
|
||||
/*! \brief set pointer to beginning of the ConfigSaver */
|
||||
inline void BeforeFirst( void ){
|
||||
inline void BeforeFirst(void){
|
||||
idx = 0;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief move iterator to next position
|
||||
* \return true if there is value in next position
|
||||
*/
|
||||
inline bool Next( void ){
|
||||
if( idx >= names.size() + names_high.size() ){
|
||||
inline bool Next(void){
|
||||
if (idx >= names.size() + names_high.size()){
|
||||
return false;
|
||||
}
|
||||
idx ++;
|
||||
idx++;
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief get current name, called after Next returns true
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name( void ) const{
|
||||
Assert( idx > 0, "can't call name before first");
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name(void) const{
|
||||
Assert(idx > 0, "can't call name before first");
|
||||
size_t i = idx - 1;
|
||||
if( i >= names.size() ){
|
||||
return names_high[ i - names.size() ].c_str();
|
||||
}else{
|
||||
return names[ i ].c_str();
|
||||
if (i >= names.size()){
|
||||
return names_high[i - names.size()].c_str();
|
||||
}
|
||||
else{
|
||||
return names[i].c_str();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief get current value, called after Next returns true
|
||||
* \return current parameter value
|
||||
* \return current parameter value
|
||||
*/
|
||||
inline const char *val( void ) const{
|
||||
Assert( idx > 0, "can't call name before first");
|
||||
inline const char *val(void) const{
|
||||
Assert(idx > 0, "can't call name before first");
|
||||
size_t i = idx - 1;
|
||||
if( i >= values.size() ){
|
||||
return values_high[ i - values.size() ].c_str();
|
||||
}else{
|
||||
return values[ i ].c_str();
|
||||
if (i >= values.size()){
|
||||
return values_high[i - values.size()].c_str();
|
||||
}
|
||||
else{
|
||||
return values[i].c_str();
|
||||
}
|
||||
}
|
||||
private:
|
||||
std::vector<std::string> names;
|
||||
std::vector<std::string> values;
|
||||
std::vector<std::string> names_high;
|
||||
std::vector<std::string> values_high;
|
||||
std::vector<std::string> values_high;
|
||||
size_t idx;
|
||||
};
|
||||
};
|
||||
|
||||
@ -16,48 +16,48 @@ namespace xgboost{
|
||||
class FeatMap{
|
||||
public:
|
||||
enum Type{
|
||||
kIndicator = 0,
|
||||
kIndicator = 0,
|
||||
kQuantitive = 1,
|
||||
kInteger = 2,
|
||||
kFloat = 3
|
||||
};
|
||||
public:
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText( const char *fname ){
|
||||
FILE *fi = utils::FopenCheck( fname, "r" );
|
||||
this->LoadText( fi );
|
||||
fclose( fi );
|
||||
inline void LoadText(const char *fname){
|
||||
FILE *fi = utils::FopenCheck(fname, "r");
|
||||
this->LoadText(fi);
|
||||
fclose(fi);
|
||||
}
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText( FILE *fi ){
|
||||
inline void LoadText(FILE *fi){
|
||||
int fid;
|
||||
char fname[256], ftype[256];
|
||||
while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){
|
||||
utils::Assert( fid == (int)names_.size(), "invalid fmap format" );
|
||||
names_.push_back( std::string(fname) );
|
||||
types_.push_back( GetType( ftype ) );
|
||||
while (fscanf(fi, "%d%s%s", &fid, fname, ftype) == 3){
|
||||
utils::Assert(fid == (int)names_.size(), "invalid fmap format");
|
||||
names_.push_back(std::string(fname));
|
||||
types_.push_back(GetType(ftype));
|
||||
}
|
||||
}
|
||||
/*! \brief number of known features */
|
||||
size_t size( void ) const{
|
||||
size_t size(void) const{
|
||||
return names_.size();
|
||||
}
|
||||
/*! \brief return name of specific feature */
|
||||
const char* name( size_t idx ) const{
|
||||
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
|
||||
return names_[ idx ].c_str();
|
||||
const char* name(size_t idx) const{
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return names_[idx].c_str();
|
||||
}
|
||||
/*! \brief return type of specific feature */
|
||||
const Type& type( size_t idx ) const{
|
||||
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
|
||||
return types_[ idx ];
|
||||
const Type& type(size_t idx) const{
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return types_[idx];
|
||||
}
|
||||
private:
|
||||
inline static Type GetType( const char *tname ){
|
||||
if( !strcmp( "i", tname ) ) return kIndicator;
|
||||
if( !strcmp( "q", tname ) ) return kQuantitive;
|
||||
if( !strcmp( "int", tname ) ) return kInteger;
|
||||
if( !strcmp( "float", tname ) ) return kFloat;
|
||||
inline static Type GetType(const char *tname){
|
||||
if (!strcmp("i", tname)) return kIndicator;
|
||||
if (!strcmp("q", tname)) return kQuantitive;
|
||||
if (!strcmp("int", tname)) return kInteger;
|
||||
if (!strcmp("float", tname)) return kFloat;
|
||||
utils::Error("unknown feature type, use i for indicator and q for quantity");
|
||||
return kIndicator;
|
||||
}
|
||||
@ -73,50 +73,50 @@ namespace xgboost{
|
||||
/*! \brief feature constraint, allow or disallow some feature during training */
|
||||
class FeatConstrain{
|
||||
public:
|
||||
FeatConstrain( void ){
|
||||
FeatConstrain(void){
|
||||
default_state_ = +1;
|
||||
}
|
||||
/*!\brief set parameters */
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
int a, b;
|
||||
if( !strcmp( name, "fban") ){
|
||||
this->ParseRange( val, a, b );
|
||||
this->SetRange( a, b, -1 );
|
||||
if (!strcmp(name, "fban")){
|
||||
this->ParseRange(val, a, b);
|
||||
this->SetRange(a, b, -1);
|
||||
}
|
||||
if( !strcmp( name, "fpass") ){
|
||||
this->ParseRange( val, a, b );
|
||||
this->SetRange( a, b, +1 );
|
||||
if (!strcmp(name, "fpass")){
|
||||
this->ParseRange(val, a, b);
|
||||
this->SetRange(a, b, +1);
|
||||
}
|
||||
if( !strcmp( name, "fdefault") ){
|
||||
default_state_ = atoi( val );
|
||||
if (!strcmp(name, "fdefault")){
|
||||
default_state_ = atoi(val);
|
||||
}
|
||||
}
|
||||
/*! \brief whether constrain is specified */
|
||||
inline bool HasConstrain( void ) const {
|
||||
inline bool HasConstrain(void) const {
|
||||
return state_.size() != 0 && default_state_ == 1;
|
||||
}
|
||||
/*! \brief whether a feature index is banned or not */
|
||||
inline bool NotBanned( unsigned index ) const{
|
||||
inline bool NotBanned(unsigned index) const{
|
||||
int rt = index < state_.size() ? state_[index] : default_state_;
|
||||
if( rt == 0 ) rt = default_state_;
|
||||
if (rt == 0) rt = default_state_;
|
||||
return rt == 1;
|
||||
}
|
||||
private:
|
||||
inline void SetRange( int a, int b, int st ){
|
||||
if( b > (int)state_.size() ) state_.resize( b, 0 );
|
||||
for( int i = a; i < b; ++ i ){
|
||||
inline void SetRange(int a, int b, int st){
|
||||
if (b >(int)state_.size()) state_.resize(b, 0);
|
||||
for (int i = a; i < b; ++i){
|
||||
state_[i] = st;
|
||||
}
|
||||
}
|
||||
}
|
||||
inline void ParseRange( const char *val, int &a, int &b ){
|
||||
if( sscanf( val, "%d-%d", &a, &b ) == 2 ) return;
|
||||
utils::Assert( sscanf( val, "%d", &a ) == 1 );
|
||||
inline void ParseRange(const char *val, int &a, int &b){
|
||||
if (sscanf(val, "%d-%d", &a, &b) == 2) return;
|
||||
utils::Assert(sscanf(val, "%d", &a) == 1);
|
||||
b = a + 1;
|
||||
}
|
||||
/*! \brief default state */
|
||||
int default_state_;
|
||||
/*! \brief whether the state here is, +1:pass, -1: ban, 0:default */
|
||||
std::vector<int> state_;
|
||||
std::vector<int> state_;
|
||||
};
|
||||
}; // namespace utils
|
||||
}; // namespace xgboost
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
* \file xgboost_matrix_csr.h
|
||||
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
*/
|
||||
#ifndef XGBOOST_MATRIX_CSR_H
|
||||
#define XGBOOST_MATRIX_CSR_H
|
||||
#include <vector>
|
||||
@ -11,13 +11,13 @@
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief a class used to help construct CSR format matrix,
|
||||
/*!
|
||||
* \brief a class used to help construct CSR format matrix,
|
||||
* can be used to convert row major CSR to column major CSR
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
* \tparam whether enabling the usage of aclist, this option must be enabled manually
|
||||
*/
|
||||
template<typename IndexType,bool UseAcList = false>
|
||||
template<typename IndexType, bool UseAcList = false>
|
||||
struct SparseCSRMBuilder{
|
||||
private:
|
||||
/*! \brief dummy variable used in the indicator matrix construction */
|
||||
@ -29,100 +29,102 @@ namespace xgboost{
|
||||
/*! \brief a list of active rows, used when many rows are empty */
|
||||
std::vector<size_t> &aclist;
|
||||
public:
|
||||
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex )
|
||||
:rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){
|
||||
Assert( !UseAcList, "enabling bug" );
|
||||
}
|
||||
/*! \brief use with caution! rptr must be cleaned before use */
|
||||
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex,
|
||||
std::vector<size_t> &p_aclist )
|
||||
:rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){
|
||||
Assert( UseAcList, "must manually enable the option use aclist" );
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(dummy_aclist){
|
||||
Assert(!UseAcList, "enabling bug");
|
||||
}
|
||||
/*! \brief use with caution! rptr must be cleaned before use */
|
||||
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex,
|
||||
std::vector<size_t> &p_aclist)
|
||||
:rptr(p_rptr), findex(p_findex), aclist(p_aclist){
|
||||
Assert(UseAcList, "must manually enable the option use aclist");
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
/*!
|
||||
* \brief step 1: initialize the number of rows in the data, not necessary exact
|
||||
* \nrows number of rows in the matrix, can be smaller than expected
|
||||
*/
|
||||
inline void InitBudget( size_t nrows = 0 ){
|
||||
if( !UseAcList ){
|
||||
inline void InitBudget(size_t nrows = 0){
|
||||
if (!UseAcList){
|
||||
rptr.clear();
|
||||
rptr.resize( nrows + 1, 0 );
|
||||
}else{
|
||||
Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" );
|
||||
rptr.resize(nrows + 1, 0);
|
||||
}
|
||||
else{
|
||||
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
|
||||
this->Cleanup();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief step 2: add budget to each rows, this function is called when aclist is used
|
||||
* \param row_id the id of the row
|
||||
* \param nelem number of element budget add to this row
|
||||
*/
|
||||
inline void AddBudget( size_t row_id, size_t nelem = 1 ){
|
||||
if( rptr.size() < row_id + 2 ){
|
||||
rptr.resize( row_id + 2, 0 );
|
||||
inline void AddBudget(size_t row_id, size_t nelem = 1){
|
||||
if (rptr.size() < row_id + 2){
|
||||
rptr.resize(row_id + 2, 0);
|
||||
}
|
||||
if( UseAcList ){
|
||||
if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id );
|
||||
if (UseAcList){
|
||||
if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
|
||||
}
|
||||
rptr[ row_id + 1 ] += nelem;
|
||||
rptr[row_id + 1] += nelem;
|
||||
}
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage( void ){
|
||||
inline void InitStorage(void){
|
||||
// initialize rptr to be beginning of each segment
|
||||
size_t start = 0;
|
||||
if( !UseAcList ){
|
||||
for( size_t i = 1; i < rptr.size(); i ++ ){
|
||||
size_t rlen = rptr[ i ];
|
||||
rptr[ i ] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}else{
|
||||
// case with active list
|
||||
std::sort( aclist.begin(), aclist.end() );
|
||||
|
||||
for( size_t i = 0; i < aclist.size(); i ++ ){
|
||||
size_t ridx = aclist[ i ];
|
||||
size_t rlen = rptr[ ridx + 1 ];
|
||||
rptr[ ridx + 1 ] = start;
|
||||
// set previous rptr to right position if previous feature is not active
|
||||
if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start;
|
||||
if (!UseAcList){
|
||||
for (size_t i = 1; i < rptr.size(); i++){
|
||||
size_t rlen = rptr[i];
|
||||
rptr[i] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
findex.resize( start );
|
||||
else{
|
||||
// case with active list
|
||||
std::sort(aclist.begin(), aclist.end());
|
||||
|
||||
for (size_t i = 0; i < aclist.size(); i++){
|
||||
size_t ridx = aclist[i];
|
||||
size_t rlen = rptr[ridx + 1];
|
||||
rptr[ridx + 1] = start;
|
||||
// set previous rptr to right position if previous feature is not active
|
||||
if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
findex.resize(start);
|
||||
}
|
||||
/*!
|
||||
* \brief step 4:
|
||||
* used in indicator matrix construction, add new
|
||||
* element to each row, the number of calls shall be exactly same as add_budget
|
||||
/*!
|
||||
* \brief step 4:
|
||||
* used in indicator matrix construction, add new
|
||||
* element to each row, the number of calls shall be exactly same as add_budget
|
||||
*/
|
||||
inline void PushElem( size_t row_id, IndexType col_id ){
|
||||
size_t &rp = rptr[ row_id + 1 ];
|
||||
findex[ rp ++ ] = col_id;
|
||||
inline void PushElem(size_t row_id, IndexType col_id){
|
||||
size_t &rp = rptr[row_id + 1];
|
||||
findex[rp++] = col_id;
|
||||
}
|
||||
/*!
|
||||
/*!
|
||||
* \brief step 5: only needed when aclist is used
|
||||
* clean up the rptr for next usage
|
||||
*/
|
||||
inline void Cleanup( void ){
|
||||
Assert( UseAcList, "this function can only be called use AcList" );
|
||||
for( size_t i = 0; i < aclist.size(); i ++ ){
|
||||
*/
|
||||
inline void Cleanup(void){
|
||||
Assert(UseAcList, "this function can only be called use AcList");
|
||||
for (size_t i = 0; i < aclist.size(); i++){
|
||||
const size_t ridx = aclist[i];
|
||||
rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0;
|
||||
rptr[ridx] = 0; rptr[ridx + 1] = 0;
|
||||
}
|
||||
aclist.clear();
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
namespace utils{
|
||||
/*!
|
||||
/*!
|
||||
* \brief simple sparse matrix container
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
*/
|
||||
*/
|
||||
template<typename IndexType>
|
||||
struct SparseCSRMat{
|
||||
private:
|
||||
@ -134,22 +136,22 @@ namespace xgboost{
|
||||
/*! \brief matrix builder*/
|
||||
SparseCSRMBuilder<IndexType> builder;
|
||||
public:
|
||||
SparseCSRMat( void ):builder( rptr, findex ){
|
||||
}
|
||||
SparseCSRMat(void) :builder(rptr, findex){
|
||||
}
|
||||
public:
|
||||
/*! \return number of rows in the matrx */
|
||||
inline size_t NumRow( void ) const{
|
||||
inline size_t NumRow(void) const{
|
||||
return rptr.size() - 1;
|
||||
}
|
||||
/*! \return number of elements r-th row */
|
||||
inline size_t NumElem( size_t r ) const{
|
||||
return rptr[ r + 1 ] - rptr[ r ];
|
||||
inline size_t NumElem(size_t r) const{
|
||||
return rptr[r + 1] - rptr[r];
|
||||
}
|
||||
/*! \return r-th row */
|
||||
inline const IndexType *operator[]( size_t r ) const{
|
||||
return &findex[ rptr[r] ];
|
||||
}
|
||||
};
|
||||
/*! \return r-th row */
|
||||
inline const IndexType *operator[](size_t r) const{
|
||||
return &findex[rptr[r]];
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -3,16 +3,16 @@
|
||||
/*!
|
||||
* \file xgboost_omp.h
|
||||
* \brief header to handle OpenMP compatibility issues
|
||||
*
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#else
|
||||
//#warning "OpenMP is not available, compile to single thread code"
|
||||
#warning "OpenMP is not available, compile to single thread code"
|
||||
inline int omp_get_thread_num() { return 0; }
|
||||
inline int omp_get_num_threads() { return 1; }
|
||||
inline void omp_set_num_threads( int nthread ) {}
|
||||
inline void omp_set_num_threads(int nthread) {}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -23,107 +23,108 @@ typedef unsigned int uint32_t;
|
||||
namespace xgboost{
|
||||
namespace random{
|
||||
/*! \brief seed the PRNG */
|
||||
inline void Seed( uint32_t seed ){
|
||||
srand( seed );
|
||||
inline void Seed(uint32_t seed){
|
||||
srand(seed);
|
||||
}
|
||||
|
||||
|
||||
/*! \brief return a real number uniform in [0,1) */
|
||||
inline double NextDouble(){
|
||||
return static_cast<double>( rand() ) / (static_cast<double>( RAND_MAX )+1.0);
|
||||
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
|
||||
}
|
||||
/*! \brief return a real numer uniform in (0,1) */
|
||||
inline double NextDouble2(){
|
||||
return (static_cast<double>( rand() ) + 1.0 ) / (static_cast<double>(RAND_MAX) + 2.0);
|
||||
return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
namespace random{
|
||||
/*! \brief return a random number */
|
||||
inline uint32_t NextUInt32( void ){
|
||||
inline uint32_t NextUInt32(void){
|
||||
return (uint32_t)rand();
|
||||
}
|
||||
/*! \brief return a random number in n */
|
||||
inline uint32_t NextUInt32( uint32_t n ){
|
||||
return (uint32_t) floor( NextDouble() * n ) ;
|
||||
}
|
||||
inline uint32_t NextUInt32(uint32_t n){
|
||||
return (uint32_t)floor(NextDouble() * n);
|
||||
}
|
||||
/*! \brief return x~N(0,1) */
|
||||
inline double SampleNormal(){
|
||||
double x,y,s;
|
||||
double x, y, s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
}while( s >= 1.0 || s == 0.0 );
|
||||
|
||||
return x * sqrt( -2.0 * log(s) / s ) ;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
|
||||
return x * sqrt(-2.0 * log(s) / s);
|
||||
}
|
||||
|
||||
|
||||
/*! \brief return iid x,y ~N(0,1) */
|
||||
inline void SampleNormal2D( double &xx, double &yy ){
|
||||
double x,y,s;
|
||||
inline void SampleNormal2D(double &xx, double &yy){
|
||||
double x, y, s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
}while( s >= 1.0 || s == 0.0 );
|
||||
double t = sqrt( -2.0 * log(s) / s ) ;
|
||||
xx = x * t;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
double t = sqrt(-2.0 * log(s) / s);
|
||||
xx = x * t;
|
||||
yy = y * t;
|
||||
}
|
||||
/*! \brief return x~N(mu,sigma^2) */
|
||||
inline double SampleNormal( double mu, double sigma ){
|
||||
inline double SampleNormal(double mu, double sigma){
|
||||
return SampleNormal() * sigma + mu;
|
||||
}
|
||||
|
||||
/*! \brief return 1 with probability p, coin flip */
|
||||
inline int SampleBinary( double p ){
|
||||
return NextDouble() < p;
|
||||
inline int SampleBinary(double p){
|
||||
return NextDouble() < p;
|
||||
}
|
||||
|
||||
|
||||
/*! \brief return distribution from Gamma( alpha, beta ) */
|
||||
inline double SampleGamma( double alpha, double beta ) {
|
||||
if ( alpha < 1.0 ) {
|
||||
inline double SampleGamma(double alpha, double beta) {
|
||||
if (alpha < 1.0) {
|
||||
double u;
|
||||
do {
|
||||
u = NextDouble();
|
||||
} while (u == 0.0);
|
||||
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
||||
} else {
|
||||
double d,c,x,v,u;
|
||||
d = alpha - 1.0/3.0;
|
||||
c = 1.0 / sqrt( 9.0 * d );
|
||||
}
|
||||
else {
|
||||
double d, c, x, v, u;
|
||||
d = alpha - 1.0 / 3.0;
|
||||
c = 1.0 / sqrt(9.0 * d);
|
||||
do {
|
||||
do {
|
||||
x = SampleNormal();
|
||||
v = 1.0 + c*x;
|
||||
} while ( v <= 0.0 );
|
||||
} while (v <= 0.0);
|
||||
v = v * v * v;
|
||||
u = NextDouble();
|
||||
} while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x)))
|
||||
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) );
|
||||
} while ((u >= (1.0 - 0.0331 * (x*x) * (x*x)))
|
||||
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))));
|
||||
return d * v / beta;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
inline void Exchange( T &a, T &b ){
|
||||
inline void Exchange(T &a, T &b){
|
||||
T c;
|
||||
c = a;
|
||||
a = b;
|
||||
b = c;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
inline void Shuffle( T *data, size_t sz ){
|
||||
if( sz == 0 ) return;
|
||||
for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){
|
||||
Exchange( data[i], data[ NextUInt32( i+1 ) ] );
|
||||
}
|
||||
inline void Shuffle(T *data, size_t sz){
|
||||
if (sz == 0) return;
|
||||
for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
|
||||
Exchange(data[i], data[NextUInt32(i + 1)]);
|
||||
}
|
||||
}
|
||||
// random shuffle the data inside, require PRNG
|
||||
template<typename T>
|
||||
inline void Shuffle( std::vector<T> &data ){
|
||||
Shuffle( &data[0], data.size() );
|
||||
inline void Shuffle(std::vector<T> &data){
|
||||
Shuffle(&data[0], data.size());
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
@ -9,44 +9,44 @@
|
||||
*/
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief interface of stream I/O, used to serialize model
|
||||
/*!
|
||||
* \brief interface of stream I/O, used to serialize model
|
||||
*/
|
||||
class IStream{
|
||||
public:
|
||||
/*!
|
||||
/*!
|
||||
* \brief read data from stream
|
||||
* \param ptr pointer to memory buffer
|
||||
* \param size size of block
|
||||
* \return usually is the size of data readed
|
||||
*/
|
||||
virtual size_t Read( void *ptr, size_t size ) = 0;
|
||||
/*!
|
||||
virtual size_t Read(void *ptr, size_t size) = 0;
|
||||
/*!
|
||||
* \brief write data to stream
|
||||
* \param ptr pointer to memory buffer
|
||||
* \param size size of block
|
||||
*/
|
||||
virtual void Write( const void *ptr, size_t size ) = 0;
|
||||
virtual void Write(const void *ptr, size_t size) = 0;
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~IStream( void ){}
|
||||
virtual ~IStream(void){}
|
||||
};
|
||||
|
||||
/*! \brief implementation of file i/o stream */
|
||||
class FileStream: public IStream{
|
||||
class FileStream : public IStream{
|
||||
private:
|
||||
FILE *fp;
|
||||
public:
|
||||
FileStream( FILE *fp ){
|
||||
public:
|
||||
FileStream(FILE *fp){
|
||||
this->fp = fp;
|
||||
}
|
||||
virtual size_t Read( void *ptr, size_t size ){
|
||||
return fread( ptr, size, 1, fp );
|
||||
virtual size_t Read(void *ptr, size_t size){
|
||||
return fread(ptr, size, 1, fp);
|
||||
}
|
||||
virtual void Write( const void *ptr, size_t size ){
|
||||
fwrite( ptr, size, 1, fp );
|
||||
virtual void Write(const void *ptr, size_t size){
|
||||
fwrite(ptr, size, 1, fp);
|
||||
}
|
||||
inline void Close( void ){
|
||||
fclose( fp );
|
||||
inline void Close(void){
|
||||
fclose(fp);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
@ -36,39 +36,29 @@ extern "C"{
|
||||
namespace xgboost{
|
||||
/*! \brief namespace for helper utils of the project */
|
||||
namespace utils{
|
||||
inline void Error( const char *msg ){
|
||||
fprintf( stderr, "Error:%s\n",msg );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
inline void Assert( bool exp ){
|
||||
if( !exp ) Error( "AssertError" );
|
||||
}
|
||||
|
||||
inline void Assert( bool exp, const char *msg ){
|
||||
if( !exp ) Error( msg );
|
||||
inline void Error(const char *msg){
|
||||
fprintf(stderr, "Error:%s\n", msg);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
inline void Warning( const char *msg ){
|
||||
fprintf( stderr, "warning:%s\n",msg );
|
||||
inline void Assert(bool exp){
|
||||
if (!exp) Error("AssertError");
|
||||
}
|
||||
|
||||
inline void Assert(bool exp, const char *msg){
|
||||
if (!exp) Error(msg);
|
||||
}
|
||||
|
||||
inline void Warning(const char *msg){
|
||||
fprintf(stderr, "warning:%s\n", msg);
|
||||
}
|
||||
|
||||
/*! \brief replace fopen, report error when the file open fails */
|
||||
inline FILE *FopenCheck( const char *fname , const char *flag ){
|
||||
FILE *fp = fopen64( fname , flag );
|
||||
if( fp == NULL ){
|
||||
fprintf( stderr, "can not open file \"%s\"\n",fname );
|
||||
exit( -1 );
|
||||
}
|
||||
return fp;
|
||||
}
|
||||
|
||||
/*! \brief replace fopen, */
|
||||
inline FILE *FopenTry( const char *fname , const char *flag ){
|
||||
FILE *fp = fopen64( fname , flag );
|
||||
if( fp == NULL ){
|
||||
fprintf( stderr, "can not open file \"%s\"\n",fname );
|
||||
exit( -1 );
|
||||
inline FILE *FopenCheck(const char *fname, const char *flag){
|
||||
FILE *fp = fopen64(fname, flag);
|
||||
if (fp == NULL){
|
||||
fprintf(stderr, "can not open file \"%s\"\n", fname);
|
||||
exit(-1);
|
||||
}
|
||||
return fp;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user