start add coltree maker

parent 10382f6365
commit fffad41e53

booster/tree/xgboost_col_treemaker.hpp (new file, 142 lines)
@@ -0,0 +1,142 @@
#ifndef _XGBOOST_COL_TREEMAKER_HPP_
#define _XGBOOST_COL_TREEMAKER_HPP_
/*!
 * \file xgboost_col_treemaker.hpp
 * \brief implementation of regression tree maker,
 *        use a column based approach, with OpenMP
 * \author Tianqi Chen: tianqi.tchen@gmail.com
 */
#include <vector>
#include <omp.h>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_random.h"

namespace xgboost{
    namespace booster{
        template<typename FMatrix>
        class ColTreeMaker{
        public:
            ColTreeMaker( RegTree &tree,
                          const TreeParamTrain &param,
                          const std::vector<float> &grad,
                          const std::vector<float> &hess,
                          const FMatrix &smat,
                          const std::vector<unsigned> &root_index ):
                tree( tree ), param( param ), grad( grad ), hess( hess ),
                smat( smat ), root_index( root_index ){
                utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
                utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
                utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
                utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
            }
            inline void Make( void ){
            }
        private:
            // statistics that are helpful to decide a split
            struct SplitEntry{
                /*! \brief gain in terms of loss */
                float loss_gain;
                /*! \brief weight calculated related to current data */
                float weight;
                /*! \brief split index */
                unsigned sindex;
                /*! \brief split value */
                float split_value;
                /*! \brief constructor */
                SplitEntry( void ){
                    weight = loss_gain = 0.0f;
                    split_value = 0.0f; sindex = 0;
                }
                inline void SetSplit( unsigned split_index, float split_value, bool default_left ){
                    if( default_left ) split_index |= (1U << 31);
                    this->sindex = split_index;
                    this->split_value = split_value;
                }
                inline unsigned split_index( void ) const{
                    return sindex & ( (1U<<31) - 1U );
                }
                inline bool default_left( void ) const{
                    return (sindex >> 31) != 0;
                }
            };
            /*! \brief per thread x per node entry to store tmp data */
            struct ThreadEntry{
                /*! \brief sum gradient statistics */
                double sum_grad;
                /*! \brief sum hessian statistics */
                double sum_hess;
                /*! \brief current best solution */
                SplitEntry best;
                ThreadEntry( void ){
                    sum_grad = sum_hess = 0;
                }
            };
        private:
            // find split at current level
            inline void FindSplit( int depth ){
                unsigned nsize = static_cast<unsigned>(feat_index.size());

                #pragma omp parallel for schedule( dynamic, 1 )
                for( unsigned i = 0; i < nsize; ++ i ){
                    const unsigned fid = feat_index[i];
                }
            }
            // initialize temp data structure
            inline void InitData( void ){
                position.resize( grad.size() );
                if( root_index.size() == 0 ){
                    std::fill( position.begin(), position.end(), 0 );
                }else{
                    for( size_t i = 0; i < root_index.size(); ++ i ){
                        position[i] = root_index[i];
                        utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
                    }
                }
                {// initialize feature index
                    for( int i = 0; i < tree.param.num_feature; i ++ ){
                        if( smat.GetSortedCol(i).Next() ){
                            feat_index.push_back( i );
                        }
                    }
                    random::Shuffle( feat_index );
                }
                {// setup temp space for each thread
                    int nthread;
                    #pragma omp parallel
                    {
                        nthread = omp_get_num_threads();
                    }
                    // reserve a small space
                    stemp.resize( nthread, std::vector<ThreadEntry>() );
                    for( size_t i = 0; i < stemp.size(); ++ i ){
                        stemp[i].reserve( 256 );
                        stemp[i].resize( tree.param.num_roots, ThreadEntry() );
                    }
                }
                {// setup statistics space for each tree node
                    snode.resize( tree.param.num_roots, SplitEntry() );
                }
            }
        private:
            // local helper tmp data structure
            // Per feature: shuffle index of each feature index
            std::vector<int> feat_index;
            // Instance Data: current node position in the tree of each instance
            std::vector<int> position;
            // TreeNode Data: statistics for each constructed node
            std::vector<SplitEntry> snode;
            // PerThread x PerTreeNode: statistics for per thread construction
            std::vector< std::vector<ThreadEntry> > stemp;
        private:
            // original data that supports tree construction
            RegTree &tree;
            const TreeParamTrain &param;
            const std::vector<float> &grad;
            const std::vector<float> &hess;
            const FMatrix &smat;
            const std::vector<unsigned> &root_index;
        };
    };
};
#endif
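Editor's note on the SplitEntry encoding above: the split feature index and the default direction for missing values are packed into one 32-bit field, with bit 31 holding the default direction and bits 0-30 the feature index. A minimal standalone sketch of the round-trip (not part of the commit):

#include <cassert>

int main( void ){
    unsigned split_index = 42;              // must fit in 31 bits
    bool default_left = true;
    // pack, as SetSplit does
    if( default_left ) split_index |= (1U << 31);
    unsigned sindex = split_index;
    // unpack, as split_index() and default_left() do
    assert( (sindex & ((1U << 31) - 1U)) == 42 );  // feature index recovered
    assert( (sindex >> 31) != 0 );                 // default direction recovered
    return 0;
}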
@@ -205,13 +205,13 @@ namespace xgboost{

             // enumerate split point of the tree
             inline void enumerate_split( RTSelecter &sglobal, int tlen,
-                                         double rsum_grad, double rsum_hess, double root_cost,
+                                         double rsum_grad, double rsum_hess, double root_gain,
                                          const SCEntry *entry, size_t start, size_t end,
                                          int findex, float parent_base_weight ){
                 // local selecter
                 RTSelecter slocal( param );

-                if( param.default_direction != 1 ){
+                if( param.need_forward_search() ){
                     // forward process, default right
                     double csum_grad = 0.0, csum_hess = 0.0;
                     for( size_t j = start; j < end; j ++ ){
@@ -225,8 +225,8 @@ namespace xgboost{
                         if( dsum_hess < param.min_child_weight ) break;
                         // change of loss
                         double loss_chg =
-                            param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                            param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                            param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                            param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;

                         const int clen = static_cast<int>( j + 1 - start );
                         // add candidate to selecter
@@ -237,7 +237,7 @@ namespace xgboost{
                     }
                 }

-                if( param.default_direction != 2 ){
+                if( param.need_backward_search() ){
                     // backward process, default left
                     double csum_grad = 0.0, csum_hess = 0.0;
                     for( size_t j = end; j > start; j -- ){
@@ -249,8 +249,8 @@ namespace xgboost{
                         if( csum_hess < param.min_child_weight ) continue;
                         const double dsum_hess = rsum_hess - csum_hess;
                         if( dsum_hess < param.min_child_weight ) break;
-                        double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                            param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                        double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                            param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
                         const int clen = static_cast<int>( end - j + 1 );
                         // add candidate to selecter
                         slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
@@ -319,8 +319,8 @@ namespace xgboost{

                 // global selecter
                 RTSelecter sglobal( param );
-                // cost root
-                const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess );
+                // gain root
+                const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
                 // KEY: layerwise, weight of current node if it is leaf
                 const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
                 // enumerate feature index
@@ -333,7 +333,7 @@ namespace xgboost{
                     std::sort( entry.begin() + start, entry.begin() + end );
                     // local selecter
                     this->enumerate_split( sglobal, tsk.len,
-                                           rsum_grad, rsum_hess, root_cost,
+                                           rsum_grad, rsum_hess, root_gain,
                                            &entry[0], start, end, findex, base_weight );
                 }
                 // Cleanup tmp_rptr for next use
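The enumerate_split changes above keep the two-pass pattern for missing values: a forward scan over the column sorted by feature value treats missing entries as defaulting right, while a backward scan treats them as defaulting left; the new need_forward_search()/need_backward_search() helpers just name the old default_direction checks. A simplified standalone sketch of that pattern (editor's code, assuming plain squared loss with no regularization; not the commit's code):

#include <cstdio>
#include <vector>
#include <algorithm>

struct Entry { double fvalue, grad, hess; };

// gain of a node given its gradient/hessian sums (squared loss, no reg)
static double CalcGain( double g, double h ){ return g * g / h; }

int main( void ){
    // one column, sorted by feature value; instances missing this
    // feature appear only in the root totals below
    std::vector<Entry> col = { {0.1,-1,1}, {0.5,2,1}, {0.9,-3,1} };
    double rsum_grad = -2.5, rsum_hess = 4;
    double root_gain = CalcGain( rsum_grad, rsum_hess );
    double best = 0;
    // forward pass: left child = scanned prefix, missing default right
    double cg = 0, ch = 0;
    for( size_t j = 0; j + 1 < col.size(); ++ j ){
        cg += col[j].grad; ch += col[j].hess;
        double chg = CalcGain( cg, ch )
                   + CalcGain( rsum_grad - cg, rsum_hess - ch ) - root_gain;
        best = std::max( best, chg );
    }
    // backward pass: right child = scanned suffix, missing default left
    cg = 0; ch = 0;
    for( size_t j = col.size(); j > 1; -- j ){
        cg += col[j-1].grad; ch += col[j-1].hess;
        double chg = CalcGain( cg, ch )
                   + CalcGain( rsum_grad - cg, rsum_hess - ch ) - root_gain;
        best = std::max( best, chg );
    }
    printf( "best loss change = %g\n", best );
    return 0;
}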
@@ -23,6 +23,7 @@ namespace xgboost{
    };

#include "xgboost_svdf_tree.hpp"
+#include "xgboost_col_treemaker.hpp"

namespace xgboost{
    namespace booster{
@@ -30,7 +31,9 @@ namespace xgboost{
        // see RegTreeUpdater
        class RegTreeTrainer : public IBooster{
        public:
-            RegTreeTrainer( void ){ silent = 0; }
+            RegTreeTrainer( void ){
+                silent = 0; tree_maker = 0;
+            }
            virtual ~RegTreeTrainer( void ){}
        public:
            virtual void SetParam( const char *name, const char *val ){
@@ -51,8 +54,8 @@ namespace xgboost{
            virtual void DoBoost( std::vector<float> &grad,
                                  std::vector<float> &hess,
                                  const FMatrixS &smat,
-                                 const std::vector<unsigned> &group_id ){
-                this->DoBoost_( grad, hess, smat, group_id );
+                                 const std::vector<unsigned> &root_index ){
+                this->DoBoost_( grad, hess, smat, root_index );
            }

            virtual int GetLeafIndex( const std::vector<float> &feat,
@@ -108,23 +111,28 @@ namespace xgboost{
            inline void DoBoost_( std::vector<float> &grad,
                                  std::vector<float> &hess,
                                  const FMatrix &smat,
-                                 const std::vector<unsigned> &group_id ){
+                                 const std::vector<unsigned> &root_index ){
                utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
                if( !silent ){
                    printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
                }
-                // start with a id set
-                RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, group_id );
-                int num_pruned;
-                tree.param.max_depth = updater.do_boost( num_pruned );
-
-                if( !silent ){
-                    printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
-                            tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                if( tree_maker == 0 ){
+                    // start with a id set
+                    RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
+                    int num_pruned;
+                    tree.param.max_depth = updater.do_boost( num_pruned );
+                    if( !silent ){
+                        printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
+                                tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                    }
+                }else{
+                    ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index );
+                    maker.Make();
                }
            }
        private:
            int silent;
+            int tree_maker;
            RegTree tree;
            TreeParamTrain param;
        private:
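The trainer now dispatches on the new tree_maker field: 0 keeps the existing row-based RTreeUpdater path unchanged, and any other value routes to the new ColTreeMaker, whose Make() is still a stub in this commit. The diff does not show how tree_maker gets set; presumably SetParam parses it the same way as silent. A hypothetical standalone sketch of that handling (editor's assumption, not shown in the commit):

#include <cstring>
#include <cstdlib>

struct TrainerSketch {
    int silent, tree_maker;
    TrainerSketch( void ){ silent = 0; tree_maker = 0; }
    inline void SetParam( const char *name, const char *val ){
        // hypothetical parameter names, mirroring the ctor defaults above
        if( !strcmp( name, "silent" ) )     silent = atoi( val );
        if( !strcmp( name, "tree_maker" ) ) tree_maker = atoi( val );
    }
};

int main( void ){
    TrainerSketch t;
    t.SetParam( "tree_maker", "1" );  // would route DoBoost_ to ColTreeMaker
    return t.tree_maker == 1 ? 0 : 1;
}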
@@ -391,7 +391,7 @@ namespace xgboost{
            }
        public:
            // calculate the cost of loss function
-            inline double CalcCost( double sum_grad, double sum_hess ) const{
+            inline double CalcGain( double sum_grad, double sum_hess ) const{
                if( sum_hess < min_child_weight ){
                    return 0.0;
                }
@@ -405,29 +405,37 @@ namespace xgboost{
            }
            // KEY:layerwise
            // calculate cost of root
-            inline double CalcRootCost( double sum_grad, double sum_hess ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
+            inline double CalcRootGain( double sum_grad, double sum_hess ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
                else return 0.0;
            }
            // KEY:layerwise
            // calculate the cost after split
            // base_weight: the base_weight of parent
-            inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
-                else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess );
+            inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
+                else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
            }
            // calculate the weight of leaf
            inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
                if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
                else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
            }
+            /*! \brief whether need forward small to big search: default right */
+            inline bool need_forward_search( void ) const{
+                return this->default_direction != 1;
+            }
+            /*! \brief whether need backward big to small search: default left */
+            inline bool need_backward_search( void ) const{
+                return this->default_direction != 2;
+            }
            /*! \brief given the loss change, whether we need to invoke pruning */
            inline bool need_prune( double loss_chg, int depth ) const{
-                return loss_chg < min_split_loss;
+                return loss_chg < this->min_split_loss;
            }
            /*! \brief whether we can split with current hessian */
            inline bool cannot_split( double sum_hess, int depth ) const{
-                return sum_hess < min_child_weight * 2.0;
+                return sum_hess < this->min_child_weight * 2.0;
            }
        };
    };
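The rename from CalcCost to CalcGain is cosmetic, but the layerwise overloads deserve a note: they score a child relative to the weight its parent already carries. CalcGain's body is not shown in this diff; assuming the usual second-order objective with an L2 penalty lambda on the per-node delta, the shift of sum_grad by sum_hess * base_weight falls out directly (editor's derivation):

% node with gradient sum G and hessian sum H, whose parent already
% predicts weight w_p; d is this node's delta on top of w_p
\mathrm{obj}(d) = (G + H w_p)\, d + \tfrac{1}{2}\,(H + \lambda)\, d^2
\qquad\Rightarrow\qquad
d^* = -\frac{G + H w_p}{H + \lambda}, \quad
\mathrm{gain} = \frac{(G + H w_p)^2}{2\,(H + \lambda)}

That is, the non-layerwise formulas with G replaced by G + H w_p: exactly the shift applied by CalcGain( sum_grad + sum_hess * base_weight, sum_hess ), while CalcWeight's recursion w = w_p + d* returns the child's total weight.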
@@ -66,6 +66,8 @@ namespace xgboost{
            /*! \return feature value in current position */
            inline bst_float fvalue( void ) const;
        };
+        /*! \brief backward iterator over column */
+        struct ColBackIter : public ColIter {};
    public:
        /*!
         * \brief get number of rows
@@ -83,17 +85,21 @@ namespace xgboost{
         * \return row iterator
         */
        inline RowIter GetRow( size_t ridx ) const;
-
+        /*! \return whether column access is enabled */
+        inline bool HaveColAccess( void ) const;
        /*!
         * \brief get column iterator, the columns must be sorted by feature value
         * \param ridx column index
         * \return column iterator
         */
        inline ColIter GetSortedCol( size_t ridx ) const;

-        /*! \return the view of derived class */
-        inline const Derived& self( void ) const{
-            return *static_cast<const Derived*>(this);
-        }
+        /*!
+         * \brief get column backward iterator, starts from biggest fvalue, and iterate back
+         * \param ridx column index
+         * \return reverse column iterator
+         */
+        inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
    };
};
};
@@ -152,6 +158,16 @@ namespace xgboost{
                return this->findex();
            }
        };
+        /*! \brief reverse column iterator */
+        struct ColBackIter: public ColIter{
+            // shadows RowIter::Next
+            inline bool Next( void ){
+                if( dptr_ == end_ ) return false;
+                else{
+                    -- dptr_; return true;
+                }
+            }
+        };
    public:
        /*! \brief constructor */
        FMatrixS( void ){ this->Clear(); }
@@ -229,6 +245,14 @@ namespace xgboost{
            it.end_ = &col_data_[ col_ptr_[cidx+1] ] - 1;
            return it;
        }
+        /*! \brief get reverse col iterator */
+        inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
+            utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
+            ColBackIter it;
+            it.dptr_ = &col_data_[ col_ptr_[cidx+1] ];
+            it.end_ = &col_data_[ col_ptr_[cidx] ];
+            return it;
+        }
        /*!
         * \brief intialize the data so that we have both column and row major
         * access, call this whenever we need column access
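On the new reverse iterator: GetReverseSortedCol starts dptr_ one past the largest-fvalue entry and Next() pre-decrements, with end_ pointing at the first entry, so forward and backward iteration share the same while( it.Next() ) loop shape. A standalone sketch of that contract (editor's code, not the commit's):

#include <cstdio>

struct BackIter {
    const float *dptr_, *end_;   // cursor one past the end, sentinel at begin
    bool Next( void ){
        if( dptr_ == end_ ) return false;
        -- dptr_; return true;   // pre-decrement, mirroring ColBackIter::Next
    }
    float fvalue( void ) const { return *dptr_; }
};

int main( void ){
    static const float col[3] = { 0.1f, 0.5f, 0.9f };  // sorted by fvalue
    BackIter it = { col + 3, col };  // dptr_ = one past end, end_ = begin
    while( it.Next() ){              // visits 0.9, 0.5, 0.1
        printf( "%g\n", it.fvalue() );
    }
    return 0;
}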