rank pass toy

This commit is contained in:
kalenhaha
2014-04-07 23:25:35 +08:00
parent 40c380e40a
commit a10f594644
32 changed files with 2237 additions and 2146 deletions

View File

@@ -21,76 +21,76 @@ namespace xgboost{
typedef unsigned bst_uint;
/*! \brief float type used in boost */
typedef float bst_float;
/*! \brief debug option for booster */
const bool bst_debug = false;
/*! \brief debug option for booster */
const bool bst_debug = false;
};
};
namespace xgboost{
namespace booster{
/**
* \brief This is a interface, defining the way to access features,
* \brief This is a interface, defining the way to access features,
* by column or by row. This interface is used to make implementation
* of booster does not depend on how feature is stored.
*
* Why template instead of virtual class: for efficiency
* feature matrix is going to be used by most inner loop of the algorithm
*
* \tparam Derived type of actual implementation
* \tparam Derived type of actual implementation
* \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing
*/
template<typename Derived>
struct FMatrix{
public:
/*! \brief exmaple iterator over one row */
/*! \brief exmaple iterator over one row */
struct RowIter{
/*!
* \brief move to next position
/*!
* \brief move to next position
* \return whether there is element in next position
*/
inline bool Next( void );
inline bool Next(void);
/*! \return feature index in current position */
inline bst_uint findex( void ) const;
inline bst_uint findex(void) const;
/*! \return feature value in current position */
inline bst_float fvalue( void ) const;
inline bst_float fvalue(void) const;
};
/*! \brief example iterator over one column */
struct ColIter{
/*!
* \brief move to next position
/*!
* \brief move to next position
* \return whether there is element in next position
*/
inline bool Next( void );
inline bool Next(void);
/*! \return row index of current position */
inline bst_uint rindex( void ) const;
inline bst_uint rindex(void) const;
/*! \return feature value in current position */
inline bst_float fvalue( void ) const;
inline bst_float fvalue(void) const;
};
/*! \brief backward iterator over column */
struct ColBackIter : public ColIter {};
public:
/*!
* \brief get number of rows
/*!
* \brief get number of rows
* \return number of rows
*/
inline size_t NumRow( void ) const;
/*!
inline size_t NumRow(void) const;
/*!
* \brief get number of columns
* \return number of columns
*/
inline size_t NumCol( void ) const;
inline size_t NumCol(void) const;
/*!
* \brief get row iterator
* \param ridx row index
* \return row iterator
*/
inline RowIter GetRow( size_t ridx ) const;
/*!
inline RowIter GetRow(size_t ridx) const;
/*!
* \brief get number of column groups, this ise used together with GetRow( ridx, gid )
* \return number of column group
*/
inline unsigned NumColGroup( void ) const{
inline unsigned NumColGroup(void) const{
return 1;
}
/*!
@@ -99,32 +99,32 @@ namespace xgboost{
* \param gid colmun group id
* \return row iterator, only iterates over features of specified column group
*/
inline RowIter GetRow( size_t ridx, unsigned gid ) const;
inline RowIter GetRow(size_t ridx, unsigned gid) const;
/*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const;
inline bool HaveColAccess(void) const;
/*!
* \brief get column iterator, the columns must be sorted by feature value
* \param ridx column index
* \return column iterator
*/
inline ColIter GetSortedCol( size_t ridx ) const;
inline ColIter GetSortedCol(size_t ridx) const;
/*!
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
* \param ridx column index
* \return reverse column iterator
*/
inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
inline ColBackIter GetReverseSortedCol(size_t ridx) const;
};
};
};
namespace xgboost{
namespace booster{
/*!
/*!
* \brief feature matrix to store training instance, in sparse CSR format
*/
class FMatrixS: public FMatrix<FMatrixS>{
*/
class FMatrixS : public FMatrix<FMatrixS>{
public:
/*! \brief one entry in a row */
struct REntry{
@@ -133,10 +133,10 @@ namespace xgboost{
/*! \brief feature value */
bst_float fvalue;
/*! \brief constructor */
REntry( void ){}
REntry(void){}
/*! \brief constructor */
REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){}
inline static bool cmp_fvalue( const REntry &a, const REntry &b ){
REntry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue){}
inline static bool cmp_fvalue(const REntry &a, const REntry &b){
return a.fvalue < b.fvalue;
}
};
@@ -147,79 +147,79 @@ namespace xgboost{
/*! \brief size of the data */
bst_uint len;
/*! \brief get k-th element */
inline const REntry& operator[]( unsigned i ) const{
inline const REntry& operator[](unsigned i) const{
return data_[i];
}
}
};
/*! \brief row iterator */
struct RowIter{
const REntry *dptr_, *end_;
RowIter( const REntry* dptr, const REntry* end )
:dptr_(dptr),end_(end){}
inline bool Next( void ){
if( dptr_ == end_ ) return false;
RowIter(const REntry* dptr, const REntry* end)
:dptr_(dptr), end_(end){}
inline bool Next(void){
if (dptr_ == end_) return false;
else{
++ dptr_; return true;
++dptr_; return true;
}
}
inline bst_uint findex( void ) const{
inline bst_uint findex(void) const{
return dptr_->findex;
}
inline bst_float fvalue( void ) const{
inline bst_float fvalue(void) const{
return dptr_->fvalue;
}
};
/*! \brief column iterator */
struct ColIter: public RowIter{
ColIter( const REntry* dptr, const REntry* end )
:RowIter( dptr, end ){}
inline bst_uint rindex( void ) const{
struct ColIter : public RowIter{
ColIter(const REntry* dptr, const REntry* end)
:RowIter(dptr, end){}
inline bst_uint rindex(void) const{
return this->findex();
}
};
/*! \brief reverse column iterator */
struct ColBackIter: public ColIter{
ColBackIter( const REntry* dptr, const REntry* end )
:ColIter( dptr, end ){}
struct ColBackIter : public ColIter{
ColBackIter(const REntry* dptr, const REntry* end)
:ColIter(dptr, end){}
// shadows RowIter::Next
inline bool Next( void ){
if( dptr_ == end_ ) return false;
inline bool Next(void){
if (dptr_ == end_) return false;
else{
-- dptr_; return true;
--dptr_; return true;
}
}
};
public:
/*! \brief constructor */
FMatrixS( void ){ this->Clear(); }
FMatrixS(void){ this->Clear(); }
/*! \brief get number of rows */
inline size_t NumRow( void ) const{
inline size_t NumRow(void) const{
return row_ptr_.size() - 1;
}
/*!
/*!
* \brief get number of nonzero entries
* \return number of nonzero entries
*/
inline size_t NumEntry( void ) const{
inline size_t NumEntry(void) const{
return row_data_.size();
}
/*! \brief clear the storage */
inline void Clear( void ){
inline void Clear(void){
row_ptr_.clear();
row_ptr_.push_back( 0 );
row_ptr_.push_back(0);
row_data_.clear();
col_ptr_.clear();
col_data_.clear();
}
/*! \brief get sparse part of current row */
inline Line operator[]( size_t sidx ) const{
inline Line operator[](size_t sidx) const{
Line sp;
utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" );
sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] );
sp.data_ = &row_data_[ row_ptr_[ sidx ] ];
utils::Assert(!bst_debug || sidx < this->NumRow(), "row id exceed bound");
sp.len = static_cast<bst_uint>(row_ptr_[sidx + 1] - row_ptr_[sidx]);
sp.data_ = &row_data_[row_ptr_[sidx]];
return sp;
}
/*!
/*!
* \brief add a row to the matrix, with data stored in STL container
* \param findex feature index
* \param fvalue feature value
@@ -227,155 +227,155 @@ namespace xgboost{
* \param fend end bound range of feature
* \return the row id added line
*/
inline size_t AddRow( const std::vector<bst_uint> &findex,
const std::vector<bst_float> &fvalue,
unsigned fstart = 0, unsigned fend = UINT_MAX ){
utils::Assert( findex.size() == fvalue.size() );
inline size_t AddRow(const std::vector<bst_uint> &findex,
const std::vector<bst_float> &fvalue,
unsigned fstart = 0, unsigned fend = UINT_MAX){
utils::Assert(findex.size() == fvalue.size());
unsigned cnt = 0;
for( size_t i = 0; i < findex.size(); i ++ ){
if( findex[i] < fstart || findex[i] >= fend ) continue;
row_data_.push_back( REntry( findex[i], fvalue[i] ) );
cnt ++;
for (size_t i = 0; i < findex.size(); i++){
if (findex[i] < fstart || findex[i] >= fend) continue;
row_data_.push_back(REntry(findex[i], fvalue[i]));
cnt++;
}
row_ptr_.push_back( row_ptr_.back() + cnt );
row_ptr_.push_back(row_ptr_.back() + cnt);
return row_ptr_.size() - 2;
}
/*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx ) const{
utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 );
inline RowIter GetRow(size_t ridx) const{
utils::Assert(!bst_debug || ridx < this->NumRow(), "row id exceed bound");
return RowIter(&row_data_[row_ptr_[ridx]] - 1, &row_data_[row_ptr_[ridx + 1]] - 1);
}
/*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx, unsigned gid ) const{
utils::Assert( gid == 0, "FMatrixS only have 1 column group" );
return FMatrixS::GetRow( ridx );
inline RowIter GetRow(size_t ridx, unsigned gid) const{
utils::Assert(gid == 0, "FMatrixS only have 1 column group");
return FMatrixS::GetRow(ridx);
}
public:
/*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const{
inline bool HaveColAccess(void) const{
return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
}
/*! \brief get number of colmuns */
inline size_t NumCol( void ) const{
utils::Assert( this->HaveColAccess() );
inline size_t NumCol(void) const{
utils::Assert(this->HaveColAccess());
return col_ptr_.size() - 1;
}
/*! \brief get col iterator*/
inline ColIter GetSortedCol( size_t cidx ) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 );
inline ColIter GetSortedCol(size_t cidx) const{
utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
return ColIter(&col_data_[col_ptr_[cidx]] - 1, &col_data_[col_ptr_[cidx + 1]] - 1);
}
/*! \brief get col iterator */
inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] );
inline ColBackIter GetReverseSortedCol(size_t cidx) const{
utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
return ColBackIter(&col_data_[col_ptr_[cidx + 1]], &col_data_[col_ptr_[cidx]]);
}
/*!
* \brief intialize the data so that we have both column and row major
* access, call this whenever we need column access
*/
inline void InitData( void ){
utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ );
builder.InitBudget( 0 );
for( size_t i = 0; i < this->NumRow(); i ++ ){
for( RowIter it = this->GetRow(i); it.Next(); ){
builder.AddBudget( it.findex() );
inline void InitData(void){
utils::SparseCSRMBuilder<REntry> builder(col_ptr_, col_data_);
builder.InitBudget(0);
for (size_t i = 0; i < this->NumRow(); i++){
for (RowIter it = this->GetRow(i); it.Next();){
builder.AddBudget(it.findex());
}
}
builder.InitStorage();
for( size_t i = 0; i < this->NumRow(); i ++ ){
for( RowIter it = this->GetRow(i); it.Next(); ){
builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) );
for (size_t i = 0; i < this->NumRow(); i++){
for (RowIter it = this->GetRow(i); it.Next();){
builder.PushElem(it.findex(), REntry((bst_uint)i, it.fvalue()));
}
}
// sort columns
unsigned ncol = static_cast<unsigned>( this->NumCol() );
for( unsigned i = 0; i < ncol; i ++ ){
std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue );
unsigned ncol = static_cast<unsigned>(this->NumCol());
for (unsigned i = 0; i < ncol; i++){
std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue);
}
}
/*!
* \brief save data to binary stream
* note: since we have size_t in ptr,
* \brief save data to binary stream
* note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machine
* \param fo output stream
*/
inline void SaveBinary( utils::IStream &fo ) const{
FMatrixS::SaveBinary( fo, row_ptr_, row_data_ );
inline void SaveBinary(utils::IStream &fo) const{
FMatrixS::SaveBinary(fo, row_ptr_, row_data_);
int col_access = this->HaveColAccess() ? 1 : 0;
fo.Write( &col_access, sizeof(int) );
if( col_access != 0 ){
FMatrixS::SaveBinary( fo, col_ptr_, col_data_ );
fo.Write(&col_access, sizeof(int));
if (col_access != 0){
FMatrixS::SaveBinary(fo, col_ptr_, col_data_);
}
}
/*!
* \brief load data from binary stream
* note: since we have size_t in ptr,
* \brief load data from binary stream
* note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machin
* \param fi input stream
*/
inline void LoadBinary( utils::IStream &fi ){
FMatrixS::LoadBinary( fi, row_ptr_, row_data_ );
int col_access;
fi.Read( &col_access, sizeof(int) );
if( col_access != 0 ){
FMatrixS::LoadBinary( fi, col_ptr_, col_data_ );
inline void LoadBinary(utils::IStream &fi){
FMatrixS::LoadBinary(fi, row_ptr_, row_data_);
int col_access;
fi.Read(&col_access, sizeof(int));
if (col_access != 0){
FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
}
}
/*!
* \brief load from text file
* \brief load from text file
* \param fi input file pointer
*/
inline void LoadText( FILE *fi ){
*/
inline void LoadText(FILE *fi){
this->Clear();
int ninst;
while( fscanf( fi, "%d", &ninst ) == 1 ){
while (fscanf(fi, "%d", &ninst) == 1){
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while( ninst -- ){
while (ninst--){
unsigned index; float value;
utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" );
findex.push_back( index ); fvalue.push_back( value );
utils::Assert(fscanf(fi, "%u:%f", &index, &value) == 2, "load Text");
findex.push_back(index); fvalue.push_back(value);
}
this->AddRow( findex, fvalue );
this->AddRow(findex, fvalue);
}
// initialize column support as well
this->InitData();
}
private:
/*!
* \brief save data to binary stream
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary( utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<REntry> &data ){
inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<REntry> &data){
size_t nrow = ptr.size() - 1;
fo.Write( &nrow, sizeof(size_t) );
fo.Write( &ptr[0], ptr.size() * sizeof(size_t) );
if( data.size() != 0 ){
fo.Write( &data[0] , data.size() * sizeof(REntry) );
fo.Write(&nrow, sizeof(size_t));
fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
if (data.size() != 0){
fo.Write(&data[0], data.size() * sizeof(REntry));
}
}
/*!
* \brief load data from binary stream
* \brief load data from binary stream
* \param fi input stream
* \param ptr pointer data
* \param data data content
*/
inline static void LoadBinary( utils::IStream &fi,
std::vector<size_t> &ptr,
std::vector<REntry> &data ){
inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> &ptr,
std::vector<REntry> &data){
size_t nrow;
utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" );
ptr.resize( nrow + 1 );
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS");
ptr.resize(nrow + 1);
utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)), "Load FMatrixS");
data.resize( ptr.back() );
if( data.size() != 0 ){
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
data.resize(ptr.back());
if (data.size() != 0){
utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)), "Load FMatrixS");
}
}
protected:
@@ -387,7 +387,7 @@ namespace xgboost{
std::vector<size_t> col_ptr_;
/*! \brief column datas */
std::vector<REntry> col_data_;
};
};
};
};
#endif