Start adding the column-based tree maker

This commit is contained in:
tqchen 2014-02-28 11:44:50 -08:00
parent 82807b3a55
commit b57656902e
5 changed files with 217 additions and 35 deletions

View File

@ -0,0 +1,142 @@
#ifndef _XGBOOST_COL_TREEMAKER_HPP_
#define _XGBOOST_COL_TREEMAKER_HPP_
/*!
* \file xgboost_col_treemaker.hpp
* \brief implementation of regression tree maker,
* use a column based approach, with OpenMP
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <omp.h>
#include <vector>
#include <algorithm>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_random.h"
namespace xgboost{
namespace booster{
template<typename FMatrix>
class ColTreeMaker{
public:
    /*!
     * \brief constructor, wires up references to the training data
     * \param tree       regression tree to be constructed (output)
     * \param param      training parameters
     * \param grad       first order gradient of each instance
     * \param hess       second order gradient (hessian) of each instance
     * \param smat       feature matrix, must provide column access
     * \param root_index root assignment of each instance; empty means all start at root 0
     */
    ColTreeMaker( RegTree &tree,
                  const TreeParamTrain &param,
                  const std::vector<float> &grad,
                  const std::vector<float> &hess,
                  const FMatrix &smat,
                  const std::vector<unsigned> &root_index ):
        tree( tree ), param( param ), grad( grad ), hess( hess ),
        smat( smat ), root_index( root_index ){
        utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
        utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
        utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
        utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
    }
    /*! \brief build the tree; construction logic is still a stub in this revision */
    inline void Make( void ){
    }
private:
    // statistics that are helpful to decide a split
    struct SplitEntry{
        /*! \brief gain in terms of loss */
        float loss_gain;
        /*! \brief weight calculated related to current data */
        float weight;
        /*! \brief split index; highest bit encodes the default direction */
        unsigned sindex;
        /*! \brief split value */
        float split_value;
        /*! \brief constructor, zero-initializes all statistics */
        SplitEntry( void ){
            weight = loss_gain = 0.0f;
            split_value = 0.0f; sindex = 0;
        }
        /*!
         * \brief record a split decision
         * \param split_index  feature index used by the split
         * \param split_value  threshold of the split
         * \param default_left whether missing values go to the left child
         */
        inline void SetSplit( unsigned split_index, float split_value, bool default_left ){
            // the highest bit of sindex stores the default direction
            if( default_left ) split_index |= (1U << 31);
            this->sindex = split_index;
            this->split_value = split_value;
        }
        /*! \return feature index of the split, with the direction bit masked out */
        inline unsigned split_index( void ) const{
            return sindex & ( (1U<<31) - 1U );
        }
        /*! \return whether missing values default to the left child */
        inline bool default_left( void ) const{
            return (sindex >> 31) != 0;
        }
    };
    /*! \brief per thread x per node entry to store tmp data */
    struct ThreadEntry{
        /*! \brief sum gradient statistics */
        double sum_grad;
        /*! \brief sum hessian statistics */
        double sum_hess;
        /*! \brief current best solution */
        SplitEntry best;
        /*! \brief constructor, zero-initializes the accumulated statistics */
        ThreadEntry( void ){
            sum_grad = sum_hess = 0.0;
        }
    };
private:
    // find split at current level
    inline void FindSplit( int depth ){
        // signed loop index keeps the pragma portable to OpenMP 2.x compilers
        const int nsize = static_cast<int>( feat_index.size() );
        #pragma omp parallel for schedule( dynamic, 1 )
        for( int i = 0; i < nsize; ++ i ){
            const unsigned fid = feat_index[i];
            // TODO: enumerate split candidates along column fid
            (void)fid;
        }
    }
    // initialize temp data structure
    inline void InitData( void ){
        position.resize( grad.size() );
        if( root_index.size() == 0 ){
            // every instance starts at root node 0
            std::fill( position.begin(), position.end(), 0 );
        }else{
            for( size_t i = 0; i < root_index.size(); ++ i ){
                position[i] = static_cast<int>( root_index[i] );
                utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
            }
        }
        {// initialize feature index: keep only columns that have at least one entry
            for( int i = 0; i < tree.param.num_feature; i ++ ){
                if( smat.GetSortedCol(i).Next() ){
                    feat_index.push_back( i );
                }
            }
            random::Shuffle( feat_index );
        }
        {// setup temp space for each thread
            // FIX: query the thread count race-free; the previous code had every
            // thread of a parallel region write `nthread` unsynchronized
            const int nthread = omp_get_max_threads();
            stemp.resize( nthread, std::vector<ThreadEntry>() );
            for( size_t i = 0; i < stemp.size(); ++ i ){
                // reserve a small space to limit reallocation as the tree grows
                stemp[i].reserve( 256 );
                stemp[i].resize( tree.param.num_roots, ThreadEntry() );
            }
        }
        {// setup statistics space for each tree node
            snode.resize( tree.param.num_roots, SplitEntry() );
        }
    }
private:
    // local helper tmp data structure
    // Per feature: shuffled indices of the usable (non-empty) features
    std::vector<int> feat_index;
    // Instance Data: current node position in the tree of each instance
    std::vector<int> position;
    // TreeNode Data: statistics for each constructed node
    std::vector<SplitEntry> snode;
    // PerThread x PerTreeNode: statistics for per thread construction
    // FIX: element type must be ThreadEntry to match its use in InitData;
    // it was declared as SplitEntry, which does not compile
    std::vector< std::vector<ThreadEntry> > stemp;
private:
    // original data that supports tree construction
    RegTree &tree;
    const TreeParamTrain &param;
    const std::vector<float> &grad;
    const std::vector<float> &hess;
    const FMatrix &smat;
    const std::vector<unsigned> &root_index;
};
};
};
#endif

View File

@ -205,13 +205,13 @@ namespace xgboost{
// enumerate split point of the tree
inline void enumerate_split( RTSelecter &sglobal, int tlen,
double rsum_grad, double rsum_hess, double root_cost,
double rsum_grad, double rsum_hess, double root_gain,
const SCEntry *entry, size_t start, size_t end,
int findex, float parent_base_weight ){
// local selecter
RTSelecter slocal( param );
if( param.default_direction != 1 ){
if( param.need_forward_search() ){
// forward process, default right
double csum_grad = 0.0, csum_hess = 0.0;
for( size_t j = start; j < end; j ++ ){
@ -225,8 +225,8 @@ namespace xgboost{
if( dsum_hess < param.min_child_weight ) break;
// change of loss
double loss_chg =
param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
const int clen = static_cast<int>( j + 1 - start );
// add candidate to selecter
@ -237,7 +237,7 @@ namespace xgboost{
}
}
if( param.default_direction != 2 ){
if( param.need_backward_search() ){
// backward process, default left
double csum_grad = 0.0, csum_hess = 0.0;
for( size_t j = end; j > start; j -- ){
@ -249,8 +249,8 @@ namespace xgboost{
if( csum_hess < param.min_child_weight ) continue;
const double dsum_hess = rsum_hess - csum_hess;
if( dsum_hess < param.min_child_weight ) break;
double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
const int clen = static_cast<int>( end - j + 1 );
// add candidate to selecter
slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
@ -319,8 +319,8 @@ namespace xgboost{
// global selecter
RTSelecter sglobal( param );
// cost root
const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess );
// gain root
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
// KEY: layerwise, weight of current node if it is leaf
const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
// enumerate feature index
@ -333,7 +333,7 @@ namespace xgboost{
std::sort( entry.begin() + start, entry.begin() + end );
// local selecter
this->enumerate_split( sglobal, tsk.len,
rsum_grad, rsum_hess, root_cost,
rsum_grad, rsum_hess, root_gain,
&entry[0], start, end, findex, base_weight );
}
// Cleanup tmp_rptr for next use

View File

@ -23,6 +23,7 @@ namespace xgboost{
};
#include "xgboost_svdf_tree.hpp"
#include "xgboost_col_treemaker.hpp"
namespace xgboost{
namespace booster{
@ -30,7 +31,9 @@ namespace xgboost{
// see RegTreeUpdater
class RegTreeTrainer : public IBooster{
public:
RegTreeTrainer( void ){ silent = 0; }
RegTreeTrainer( void ){
silent = 0; tree_maker = 0;
}
virtual ~RegTreeTrainer( void ){}
public:
virtual void SetParam( const char *name, const char *val ){
@ -51,8 +54,8 @@ namespace xgboost{
virtual void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrixS &smat,
const std::vector<unsigned> &group_id ){
this->DoBoost_( grad, hess, smat, group_id );
const std::vector<unsigned> &root_index ){
this->DoBoost_( grad, hess, smat, root_index );
}
virtual int GetLeafIndex( const std::vector<float> &feat,
@ -108,23 +111,28 @@ namespace xgboost{
inline void DoBoost_( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &smat,
const std::vector<unsigned> &group_id ){
const std::vector<unsigned> &root_index ){
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
if( !silent ){
printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
}
// start with a id set
RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, group_id );
int num_pruned;
tree.param.max_depth = updater.do_boost( num_pruned );
if( !silent ){
printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
if( tree_maker == 0 ){
// start with a id set
RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
int num_pruned;
tree.param.max_depth = updater.do_boost( num_pruned );
if( !silent ){
printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
}
}else{
ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index );
maker.Make();
}
}
private:
int silent;
int tree_maker;
RegTree tree;
TreeParamTrain param;
private:

View File

@ -391,7 +391,7 @@ namespace xgboost{
}
public:
// calculate the cost of loss function
inline double CalcCost( double sum_grad, double sum_hess ) const{
inline double CalcGain( double sum_grad, double sum_hess ) const{
if( sum_hess < min_child_weight ){
return 0.0;
}
@ -405,29 +405,37 @@ namespace xgboost{
}
// KEY:layerwise
// calculate cost of root
inline double CalcRootCost( double sum_grad, double sum_hess ) const{
if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
inline double CalcRootGain( double sum_grad, double sum_hess ) const{
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
else return 0.0;
}
// KEY:layerwise
// calculate the cost after split
// base_weight: the base_weight of parent
inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{
if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess );
inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
}
// calculate the weight of leaf
inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
}
/*! \brief whether need forward small to big search: default right */
inline bool need_forward_search( void ) const{
return this->default_direction != 1;
}
/*! \brief whether need forward big to small search: default left */
inline bool need_backward_search( void ) const{
return this->default_direction != 2;
}
/*! \brief given the loss change, whether we need to invode prunning */
inline bool need_prune( double loss_chg, int depth ) const{
return loss_chg < min_split_loss;
return loss_chg < this->min_split_loss;
}
/*! \brief whether we can split with current hessian */
inline bool cannot_split( double sum_hess, int depth ) const{
return sum_hess < min_child_weight * 2.0;
return sum_hess < this->min_child_weight * 2.0;
}
};
};

View File

@ -66,6 +66,8 @@ namespace xgboost{
/*! \return feature value in current position */
inline bst_float fvalue( void ) const;
};
/*! \brief backward iterator over column */
struct ColBackIter : public ColIter {};
public:
/*!
* \brief get number of rows
@ -83,17 +85,21 @@ namespace xgboost{
* \return row iterator
*/
inline RowIter GetRow( size_t ridx ) const;
/*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const;
/*!
* \brief get column iterator, the columns must be sorted by feature value
* \param ridx column index
* \return column iterator
*/
inline ColIter GetSortedCol( size_t ridx ) const;
/*! \return the view of derived class */
inline const Derived& self( void ) const{
return *static_cast<const Derived*>(this);
}
/*!
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
* \param ridx column index
* \return reverse column iterator
*/
inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
};
};
};
@ -152,6 +158,16 @@ namespace xgboost{
return this->findex();
}
};
/*! \brief reverse column iterator: walks a column's entries from back to front */
struct ColBackIter: public ColIter{
    // shadows RowIter::Next, stepping backward instead of forward
    // NOTE(review): dptr_ is expected to start one past the last entry and
    // end_ at the first entry (see GetReverseSortedCol), so the first Next()
    // lands on the last element and the element at end_ is still visited
    // before Next() returns false — confirm against the FMatrixS layout
    inline bool Next( void ){
        if( dptr_ == end_ ) return false;
        else{
            -- dptr_; return true;
        }
    }
};
public:
/*! \brief constructor */
FMatrixS( void ){ this->Clear(); }
@ -229,6 +245,14 @@ namespace xgboost{
it.end_ = &col_data_[ col_ptr_[cidx+1] ] - 1;
return it;
}
/*! \brief get col iterator */
inline ColIter GetReverseSortedCol( size_t cidx ) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
ColIter it;
it.dptr_ = &col_data_[ col_ptr_[cidx+1] ];
it.end_ = &col_data_[ col_ptr_[cidx] ];
return it;
}
/*!
* \brief intialize the data so that we have both column and row major
* access, call this whenever we need column access