adapt svdfeature tree

This commit is contained in:
tqchen
2014-02-07 22:38:26 -08:00
parent bf36374678
commit 5d052b9e14
7 changed files with 1265 additions and 4 deletions

152
utils/xgboost_matrix_csr.h Normal file
View File

@@ -0,0 +1,152 @@
/*!
* \file xgboost_matrix_csr.h
* \brief this file defines easy-to-use, STL-based classes for in-memory sparse CSR matrices
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#ifndef _XGBOOST_MATRIX_CSR_H_
#define _XGBOOST_MATRIX_CSR_H_
#include <vector>
#include <algorithm>
#include "xgboost_utils.h"
namespace xgboost{
namespace utils{
/*!
 * \brief a class used to help construct CSR format matrix,
 *        can be used to convert row major CSR to column major CSR
 *
 *  The builder does not own any matrix data: it writes into the vectors
 *  handed to its constructor. Expected call order:
 *    1. InitBudget  - set the number of rows
 *    2. AddBudget   - once per element, to count the size of each row
 *    3. InitStorage - turn the counts into offsets and size the index array
 *    4. PushElem    - once per element, to store the column index
 *    5. Cleanup     - only when the active list is used
 *
 * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
 * \tparam UseAcList whether to record the non-empty rows in an active list (aclist);
 *         this option must be enabled manually, together with the three-argument constructor
 */
template<typename IndexType,bool UseAcList = false>
struct SparseCSRMBuilder{
private:
    /*! \brief dummy variable the aclist reference is bound to when no active list is supplied */
    std::vector<size_t> dummy_aclist;
    /*! \brief pointer to each of the row */
    std::vector<size_t> &rptr;
    /*! \brief index of nonzero entries in each row */
    std::vector<IndexType> &findex;
    /*! \brief a list of active rows, used when many rows are empty */
    std::vector<size_t> &aclist;
public:
    /*!
     * \brief construct a builder that does not use the active list
     * \param p_rptr   row pointer array to fill, resized by InitBudget
     * \param p_findex index array to fill, resized by InitStorage
     */
    SparseCSRMBuilder( std::vector<size_t> &p_rptr,
                       std::vector<IndexType> &p_findex )
        :rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){
        Assert( !UseAcList, "enabling bug" );
    }
    /*!
     * \brief construct a builder that uses the active list,
     *        use with caution! rptr must be cleaned before use
     * \param p_rptr   row pointer array to fill, must already hold nrows + 1 entries
     * \param p_findex index array to fill, resized by InitStorage
     * \param p_aclist receives the ids of the rows that were given a budget
     */
    SparseCSRMBuilder( std::vector<size_t> &p_rptr,
                       std::vector<IndexType> &p_findex,
                       std::vector<size_t> &p_aclist )
        :rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){
        Assert( UseAcList, "must manually enable the option use aclist" );
    }
public:
    /*!
     * \brief step 1: initialize the number of rows in the data
     * \param nrows number of rows in the matrix
     */
    inline void InitBudget( size_t nrows ){
        if( !UseAcList ){
            rptr.resize( nrows + 1 );
            std::fill( rptr.begin(), rptr.end(), 0 );
        }else{
            // with the active list rptr is reused between runs, so it is only
            // checked for size and the entries touched last time are reset
            Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" );
            this->Cleanup();
        }
    }
    /*!
     * \brief step 2: add budget to each rows, this function is called when aclist is used
     * \param row_id the id of the row
     * \param nelem  number of element budget add to this row
     */
    inline void AddBudget( size_t row_id, size_t nelem = 1 ){
        if( UseAcList ){
            // a counter of 0 is taken to mean "row not seen yet"
            if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id );
        }
        rptr[ row_id + 1 ] += nelem;
    }
    /*! \brief step 3: initialize the necessary storage */
    inline void InitStorage( void ){
        // initialize rptr to be beginning of each segment
        size_t start = 0;
        if( !UseAcList ){
            for( size_t i = 1; i < rptr.size(); i ++ ){
                size_t rlen = rptr[ i ];
                rptr[ i ] = start;
                start += rlen;
            }
        }else{
            // case with active list
            std::sort( aclist.begin(), aclist.end() );
            // A row whose counter is still 0 is appended again by every AddBudget
            // call (e.g. AddBudget( r, 0 )). Visiting such a row twice below would
            // read the offset written on the first visit as if it were the row
            // length, so duplicates are dropped first.
            aclist.erase( std::unique( aclist.begin(), aclist.end() ), aclist.end() );
            for( size_t i = 0; i < aclist.size(); i ++ ){
                size_t ridx = aclist[ i ];
                size_t rlen = rptr[ ridx + 1 ];
                rptr[ ridx + 1 ] = start;
                // set previous rptr to right position if previous feature is not active
                if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start;
                start += rlen;
            }
        }
        findex.resize( start );
    }
    /*!
     * \brief step 4:
     *  used in indicator matrix construction, add new
     *  element to each row, the number of calls shall be exactly same as add_budget
     * \param row_id the id of the row receiving the element
     * \param col_id the index value stored for the element
     */
    inline void PushElem( size_t row_id, IndexType col_id ){
        // rptr[ row_id + 1 ] is the write cursor of the row; once the row is
        // full it has advanced to the end of the row
        size_t &rp = rptr[ row_id + 1 ];
        findex[ rp ++ ] = col_id;
    }
    /*!
     * \brief step 5: only needed when aclist is used
     *  clean up the rptr for next usage
     */
    inline void Cleanup( void ){
        Assert( UseAcList, "this function can only be called use AcList" );
        // only the entries belonging to active rows were written, reset just those
        for( size_t i = 0; i < aclist.size(); i ++ ){
            const size_t ridx = aclist[i];
            rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0;
        }
        aclist.clear();
    }
};
};
namespace utils{
/*!
 * \brief simple sparse matrix container, owns the row pointer and index
 *        arrays and exposes a builder that fills them
 * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
 */
template<typename IndexType>
struct SparseCSRMat{
private:
    /*! \brief pointer to each of the row */
    std::vector<size_t> rptr;
    /*! \brief index of nonzero entries in each row */
    std::vector<IndexType> findex;
public:
    /*! \brief matrix builder, holds references to rptr and findex of this matrix */
    SparseCSRMBuilder<IndexType> builder;
public:
    SparseCSRMat( void ):builder( rptr, findex ){
    }
    /*!
     * \brief copy the content of another matrix
     *  the builder is bound to the storage of the new object; a member-wise
     *  copy would leave it referring to the vectors of other
     */
    SparseCSRMat( const SparseCSRMat &other )
        :rptr( other.rptr ), findex( other.findex ), builder( rptr, findex ){
    }
    /*!
     * \brief assign the content of another matrix
     *  only the data is copied, builder keeps referring to the storage of this object
     */
    inline SparseCSRMat &operator=( const SparseCSRMat &other ){
        rptr = other.rptr;
        findex = other.findex;
        return *this;
    }
public:
    /*! \return number of rows in the matrix, 0 when nothing has been built yet */
    inline size_t NumRow( void ) const{
        // rptr holds one entry more than there are rows; guard the empty
        // case so that the subtraction can not wrap around
        return rptr.empty() ? 0 : rptr.size() - 1;
    }
    /*! \return number of elements r-th row */
    inline size_t NumElem( size_t r ) const{
        return rptr[ r + 1 ] - rptr[ r ];
    }
    /*!
     * \return pointer to the first index of the r-th row, valid for NumElem( r ) entries;
     *         a null pointer when the matrix stores no element at all
     */
    inline const IndexType *operator[]( size_t r ) const{
        if( findex.empty() ) return 0;
        // pointer arithmetic instead of findex[ rptr[r] ]: for an empty row at
        // the end of the matrix rptr[r] equals findex.size()
        return &findex[0] + rptr[r];
    }
};
};
};
#endif

131
utils/xgboost_random.h Normal file
View File

@@ -0,0 +1,131 @@
#ifndef _XGBOOST_RANDOM_H_
#define _XGBOOST_RANDOM_H_
/*!
* \file xgboost_random.h
* \brief PRNG to support random number generation
* \author Tianqi Chen: tianqi.tchen@gmail.com
*
* Use standard PRNG from stdlib
*/
#include <cmath>
#include <cstdlib>
#include <vector>
// fixed width unsigned integer types: declared by hand when compiling with
// MSVC (_MSC_VER defined), taken from <inttypes.h> on every other compiler
#ifdef _MSC_VER
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
#else
#include <inttypes.h>
#endif
/*! namespace of PRNG */
namespace xgboost{
namespace random{
/*! \brief seed the PRNG, the value is passed on to srand of the C library */
inline void Seed( uint32_t seed ){
    const unsigned int c_seed = seed;
    srand( c_seed );
}
/*! \brief return a real number uniform in [0,1), computed from a single call of rand() */
inline double NextDouble(){
    const double draw  = static_cast<double>( rand() );
    // one more than the largest value of rand(), so that 1.0 is never reached
    const double range = static_cast<double>( RAND_MAX ) + 1.0;
    return draw / range;
}
/*! \brief return a real number uniform in (0,1), both end points are excluded */
inline double NextDouble2(){
    // shifting the draw by one keeps the result above 0,
    // widening the range by two keeps it below 1
    const double shifted = static_cast<double>( rand() ) + 1.0;
    const double range   = static_cast<double>( RAND_MAX ) + 2.0;
    return shifted / range;
}
};
namespace random{
/*! \brief return a random number: the next value of rand() converted to uint32_t */
inline uint32_t NextUInt32( void ){
    const int raw = rand();
    return static_cast<uint32_t>( raw );
}
/*!
 * \brief return a random integer in [0,n), obtained by scaling NextDouble() by n
 *        and rounding down; the result is 0 when n is 0
 */
inline uint32_t NextUInt32( uint32_t n ){
    const double scaled = NextDouble() * n;
    return static_cast<uint32_t>( floor( scaled ) );
}
/*!
 * \brief return x~N(0,1)
 *  a point is drawn uniformly from the square (-1,1)x(-1,1) until it falls
 *  strictly inside the unit circle and is not the origin, its first
 *  coordinate is then rescaled
 */
inline double SampleNormal(){
    double px, py, r2;
    for( ;; ){
        px = 2 * NextDouble2() - 1.0;
        py = 2 * NextDouble2() - 1.0;
        r2 = px * px + py * py;
        // r2 == 0 is rejected as well, it would end up in log( 0 ) / 0 below
        if( !( r2 >= 1.0 || r2 == 0.0 ) ) break;
    }
    return px * sqrt( -2.0 * log( r2 ) / r2 );
}
/*!
 * \brief return iid x,y ~N(0,1)
 *  same rejection step as the single value sampler, but both coordinates
 *  of the accepted point are rescaled and returned
 * \param xx output, first sample
 * \param yy output, second sample
 */
inline void SampleNormal2D( double &xx, double &yy ){
    double px, py, r2;
    for( ;; ){
        px = 2 * NextDouble2() - 1.0;
        py = 2 * NextDouble2() - 1.0;
        r2 = px * px + py * py;
        // keep only points strictly inside the unit circle, excluding the origin
        if( !( r2 >= 1.0 || r2 == 0.0 ) ) break;
    }
    const double scale = sqrt( -2.0 * log( r2 ) / r2 );
    xx = px * scale;
    yy = py * scale;
}
/*!
 * \brief return x~N(mu,sigma^2), a standard normal draw scaled by sigma and shifted by mu
 * \param mu    mean of the distribution
 * \param sigma standard deviation of the distribution
 */
inline double SampleNormal( double mu, double sigma ){
    const double z = SampleNormal();
    return z * sigma + mu;
}
/*! \brief coin flip: return 1 with probability p and 0 otherwise */
inline int SampleBinary( double p ){
    const bool head = NextDouble() < p;
    return head ? 1 : 0;
}
/*!
 * \brief return a sample from Gamma( alpha, beta )
 *  the accepted value is divided by beta before it is returned
 * \param alpha shape parameter
 * \param beta  divisor applied to the sample
 */
inline double SampleGamma( double alpha, double beta ) {
    if ( alpha < 1.0 ) {
        // draw for shape alpha + 1 and scale it by u^(1/alpha), u must not be 0
        double u = NextDouble();
        while ( u == 0.0 ) u = NextDouble();
        const double boosted = SampleGamma( alpha + 1.0, beta );
        return boosted * pow( u, 1.0 / alpha );
    }
    // rejection sampler for alpha >= 1
    const double d = alpha - 1.0/3.0;
    const double c = 1.0 / sqrt( 9.0 * d );
    double v;
    for ( ;; ) {
        double x;
        // redraw until the cubed term is positive
        do {
            x = SampleNormal();
            v = 1.0 + c*x;
        } while ( v <= 0.0 );
        v = v * v * v;
        const double u = NextDouble();
        // cheap test first, accepts without evaluating any logarithm
        if ( !( u >= (1.0 - 0.0331 * (x*x) * (x*x)) ) ) break;
        // full acceptance test
        if ( !( log(u) >= (0.5 * x * x + d * (1.0 - v + log(v))) ) ) break;
    }
    return d * v / beta;
}
/*!
 * \brief exchange the values of a and b
 * \tparam T type of the values, must be copy constructible and assignable
 * \param a first value, holds the old value of b afterwards
 * \param b second value, holds the old value of a afterwards
 */
template<typename T>
inline void Exchange( T &a, T &b ){
    // the temporary is copy constructed, so T needs no default constructor
    // and is not default constructed just to be overwritten
    T tmp( a );
    a = b;
    b = tmp;
}
/*!
 * \brief randomly shuffle an array in place
 *  walks from the last position down to position 1 and exchanges each
 *  position i with a position chosen by NextUInt32( i + 1 )
 * \param data pointer to the first element
 * \param sz   number of elements, note that it is narrowed to uint32_t for the loop counter
 */
template<typename T>
inline void Shuffle( T *data, size_t sz ){
    if( sz == 0 ) return;
    uint32_t pos = (uint32_t)sz - 1;
    while( pos > 0 ){
        const uint32_t pick = NextUInt32( pos + 1 );
        Exchange( data[pos], data[pick] );
        -- pos;
    }
}
/*!
 * \brief random shuffle the data inside, require PRNG
 * \param data vector to be shuffled in place, an empty vector is left untouched
 */
template<typename T>
inline void Shuffle( std::vector<T> &data ){
    // an empty vector has no element 0, so data[0] must not be evaluated
    if( data.empty() ) return;
    Shuffle( &data[0], data.size() );
}
};
};
#endif