From fffad41e534d5e4c155a9dddbba14f8b018da39c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 28 Feb 2014 11:44:50 -0800
Subject: [PATCH] start add coltree maker

---
 booster/tree/xgboost_col_treemaker.hpp | 142 +++++++++++++++++++++++++
 booster/tree/xgboost_svdf_tree.hpp     |  20 ++--
 booster/tree/xgboost_tree.hpp          |  32 +++---
 booster/tree/xgboost_tree_model.h      |  24 +++--
 booster/xgboost_data.h                 |  34 +++++-
 5 files changed, 217 insertions(+), 35 deletions(-)
 create mode 100644 booster/tree/xgboost_col_treemaker.hpp

diff --git a/booster/tree/xgboost_col_treemaker.hpp b/booster/tree/xgboost_col_treemaker.hpp
new file mode 100644
index 000000000..d5591d8df
--- /dev/null
+++ b/booster/tree/xgboost_col_treemaker.hpp
@@ -0,0 +1,142 @@
+#ifndef _XGBOOST_COL_TREEMAKER_HPP_
+#define _XGBOOST_COL_TREEMAKER_HPP_
+/*!
+ * \file xgboost_col_treemaker.hpp
+ * \brief implementation of regression tree maker,
+ *        using a column-based approach, with OpenMP
+ * \author Tianqi Chen: tianqi.tchen@gmail.com
+ */
+#include <vector>
+#include <omp.h>
+#include "xgboost_tree_model.h"
+#include "../../utils/xgboost_random.h"
+
+namespace xgboost{
+    namespace booster{
+        template<typename FMatrix>
+        class ColTreeMaker{
+        public:
+            ColTreeMaker( RegTree &tree,
+                          const TreeParamTrain &param,
+                          const std::vector<float> &grad,
+                          const std::vector<float> &hess,
+                          const FMatrix &smat,
+                          const std::vector<unsigned> &root_index ):
+                tree( tree ), param( param ), grad( grad ), hess( hess ),
+                smat( smat ), root_index( root_index ){
+                utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
+                utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
+                utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
+                utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
+            }
+            inline void Make( void ){
+            }
+        private:
+            // statistics that are helpful to decide a split
+            struct SplitEntry{
+                /*! \brief gain in terms of loss */
+                float loss_gain;
+                /*! \brief weight calculated related to current data */
+                float weight;
+                /*! \brief split index */
+                unsigned sindex;
+                /*! \brief split value */
+                float split_value;
+                /*! \brief constructor */
+                SplitEntry( void ){
+                    weight = loss_gain = 0.0f;
+                    split_value = 0.0f; sindex = 0;
+                }
+                inline void SetSplit( unsigned split_index, float split_value, bool default_left ){
+                    if( default_left ) split_index |= (1U << 31);
+                    this->sindex = split_index;
+                    this->split_value = split_value;
+                }
+                inline unsigned split_index( void ) const{
+                    return sindex & ( (1U<<31) - 1U );
+                }
+                inline bool default_left( void ) const{
+                    return (sindex >> 31) != 0;
+                }
+            };
+            /*! \brief per thread x per node entry to store tmp data */
+            struct ThreadEntry{
+                /*! \brief sum gradient statistics */
+                double sum_grad;
+                /*! \brief sum hessian statistics */
+                double sum_hess;
+                /*! \brief current best solution */
+                SplitEntry best;
+                ThreadEntry( void ){
+                    sum_grad = sum_hess = 0;
+                }
+            };
+        private:
+            // find split at current level
+            inline void FindSplit( int depth ){
+                unsigned nsize = static_cast<unsigned>( feat_index.size() );
+
+                #pragma omp parallel for schedule( dynamic, 1 )
+                for( unsigned i = 0; i < nsize; ++ i ){
+                    const unsigned fid = feat_index[i];
+                }
+            }
+            // initialize temp data structure
+            inline void InitData( void ){
+                position.resize( grad.size() );
+                if( root_index.size() == 0 ){
+                    std::fill( position.begin(), position.end(), 0 );
+                }else{
+                    for( size_t i = 0; i < root_index.size(); ++ i ){
+                        position[i] = root_index[i];
+                        utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
+                    }
+                }
+                {// initialize feature index
+                    for( int i = 0; i < tree.param.num_feature; i ++ ){
+                        if( smat.GetSortedCol(i).Next() ){
+                            feat_index.push_back( i );
+                        }
+                    }
+                    random::Shuffle( feat_index );
+                }
+                {// setup temp space for each thread
+                    int nthread;
+                    #pragma omp parallel
+                    {
+                        nthread = omp_get_num_threads();
+                    }
+                    // reserve a small space
+                    stemp.resize( nthread, std::vector<ThreadEntry>() );
+                    for( size_t i = 0; i < stemp.size(); ++ i ){
+                        stemp[i].reserve( 256 );
+                        stemp[i].resize( tree.param.num_roots, ThreadEntry() );
+                    }
+                }
+                {// setup statistics space for each tree node
+                    snode.resize( tree.param.num_roots, SplitEntry() );
+                }
+            }
+        private:
+            // local helper tmp data structure
+            // Per feature: shuffled index of each feature
+            std::vector<unsigned> feat_index;
+            // Instance Data: current node position in the tree of each instance
+            std::vector<int> position;
+            // TreeNode Data: statistics for each constructed node
+            std::vector<SplitEntry> snode;
+            // PerThread x PerTreeNode: statistics for per thread construction
+            std::vector< std::vector<ThreadEntry> > stemp;
+        private:
+            // original data that supports tree construction
+            RegTree &tree;
+            const TreeParamTrain &param;
+            const std::vector<float> &grad;
+            const std::vector<float> &hess;
+            const FMatrix &smat;
+            const std::vector<unsigned> &root_index;
+        };
+    };
+};
+#endif
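A note on the SplitEntry bookkeeping above: the feature index and the default direction for missing values are packed into the single unsigned sindex, with bit 31 carrying the direction and bits 0-30 the feature index. The standalone sketch below is not part of the patch; it only demonstrates that the encoding round-trips the same way SetSplit, split_index() and default_left() do:

    #include <cassert>

    int main( void ){
        // pack: feature 42, missing values default to the left branch
        unsigned split_index = 42;
        bool default_left = true;
        if( default_left ) split_index |= (1U << 31);
        unsigned sindex = split_index;
        // unpack, mirroring SplitEntry::split_index() and default_left()
        assert( (sindex & ((1U << 31) - 1U)) == 42 );  // low 31 bits: feature index
        assert( (sindex >> 31) != 0 );                 // top bit: default direction
        return 0;
    }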
diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp
index b504910a4..b171f8876 100644
--- a/booster/tree/xgboost_svdf_tree.hpp
+++ b/booster/tree/xgboost_svdf_tree.hpp
@@ -205,13 +205,13 @@ namespace xgboost{
         // enumerate split point of the tree
         inline void enumerate_split( RTSelecter &sglobal,
                                      int tlen,
-                                     double rsum_grad, double rsum_hess, double root_cost,
+                                     double rsum_grad, double rsum_hess, double root_gain,
                                      const SCEntry *entry, size_t start, size_t end,
                                      int findex, float parent_base_weight ){
             // local selecter
             RTSelecter slocal( param );
-            if( param.default_direction != 1 ){
+            if( param.need_forward_search() ){
                 // forward process, default right
                 double csum_grad = 0.0, csum_hess = 0.0;
                 for( size_t j = start; j < end; j ++ ){
@@ -225,8 +225,8 @@
                     if( dsum_hess < param.min_child_weight ) break;
                     // change of loss
                     double loss_chg =
-                        param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                        param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                        param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                        param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
                     const int clen = static_cast<int>( j + 1 - start );
                     // add candidate to selecter
@@ -237,7 +237,7 @@
                 }
             }
-            if( param.default_direction != 2 ){
+            if( param.need_backward_search() ){
                 // backward process, default left
                 double csum_grad = 0.0, csum_hess = 0.0;
                 for( size_t j = end; j > start; j -- ){
@@ -249,8 +249,8 @@
                     if( csum_hess < param.min_child_weight ) continue;
                     const double dsum_hess = rsum_hess - csum_hess;
                     if( dsum_hess < param.min_child_weight ) break;
-                    double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                        param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                    double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                        param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
                     const int clen = static_cast<int>( end - j + 1 );
                     // add candidate to selecter
                     slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
@@ -319,8 +319,8 @@
             // global selecter
             RTSelecter sglobal( param );
-            // cost root
-            const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess );
+            // gain root
+            const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
             // KEY: layerwise, weight of current node if it is leaf
             const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
             // enumerate feature index
@@ -333,7 +333,7 @@
                 std::sort( entry.begin() + start, entry.begin() + end );
                 // local selecter
                 this->enumerate_split( sglobal, tsk.len,
-                                       rsum_grad, rsum_hess, root_cost,
+                                       rsum_grad, rsum_hess, root_gain,
                                        &entry[0], start, end, findex, base_weight );
             }
             // Cleanup tmp_rptr for next use
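The renames in this file (CalcCost to CalcGain, root_cost to root_gain) make the selection criterion read as it is used: a split is scored by the sum of the two children's gains minus the gain of leaving the node whole, and the forward/backward scans cover the two possible default directions for missing values. A minimal sketch of the forward scan over one sorted column follows; it assumes the classic gain G^2/(H+lambda), since CalcGain's body lies outside these hunks, and skips the min_child_weight checks:

    #include <cstdio>
    #include <vector>

    // assumed gain function; the real param.CalcGain body is not shown in this patch
    static double CalcGain( double G, double H, double lambda ){
        return G * G / ( H + lambda );
    }

    int main( void ){
        // one column of (fvalue, grad, hess), already sorted by feature value
        std::vector<double> fvalue = { 0.1, 0.5, 0.9, 2.0 };
        std::vector<double> grad   = { -1.0, 0.5, 2.0, -0.5 };
        std::vector<double> hess   = { 1.0, 1.0, 1.0, 1.0 };
        const double lambda = 1.0;
        double rsum_grad = 0.0, rsum_hess = 0.0;
        for( size_t i = 0; i < grad.size(); ++ i ){
            rsum_grad += grad[i]; rsum_hess += hess[i];
        }
        const double root_gain = CalcGain( rsum_grad, rsum_hess, lambda );
        // forward scan: instances below the split go left, missing values default right
        double csum_grad = 0.0, csum_hess = 0.0, best_chg = 0.0, best_split = 0.0;
        for( size_t j = 0; j + 1 < fvalue.size(); ++ j ){
            csum_grad += grad[j]; csum_hess += hess[j];
            double loss_chg = CalcGain( csum_grad, csum_hess, lambda )
                + CalcGain( rsum_grad - csum_grad, rsum_hess - csum_hess, lambda )
                - root_gain;
            if( loss_chg > best_chg ){
                best_chg = loss_chg;
                best_split = 0.5 * ( fvalue[j] + fvalue[j+1] );
            }
        }
        printf( "best loss_chg=%g at split=%g\n", best_chg, best_split );
        return 0;
    }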
diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp
index 783e7c380..c74030ca6 100644
--- a/booster/tree/xgboost_tree.hpp
+++ b/booster/tree/xgboost_tree.hpp
@@ -23,6 +23,7 @@ namespace xgboost{
 };
 
 #include "xgboost_svdf_tree.hpp"
+#include "xgboost_col_treemaker.hpp"
 
 namespace xgboost{
     namespace booster{
@@ -30,7 +31,9 @@ namespace xgboost{
         // see RegTreeUpdater
         class RegTreeTrainer : public IBooster{
         public:
-            RegTreeTrainer( void ){ silent = 0; }
+            RegTreeTrainer( void ){
+                silent = 0; tree_maker = 0;
+            }
             virtual ~RegTreeTrainer( void ){}
         public:
             virtual void SetParam( const char *name, const char *val ){
@@ -51,8 +54,8 @@
             virtual void DoBoost( std::vector<float> &grad,
                                   std::vector<float> &hess,
                                   const FMatrixS &smat,
-                                  const std::vector<unsigned> &group_id ){
-                this->DoBoost_( grad, hess, smat, group_id );
+                                  const std::vector<unsigned> &root_index ){
+                this->DoBoost_( grad, hess, smat, root_index );
             }
 
             virtual int GetLeafIndex( const std::vector<float> &feat,
@@ -108,23 +111,28 @@
             inline void DoBoost_( std::vector<float> &grad,
                                   std::vector<float> &hess,
                                   const FMatrix &smat,
-                                  const std::vector<unsigned> &group_id ){
+                                  const std::vector<unsigned> &root_index ){
                 utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
                 if( !silent ){
                     printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
                 }
-                // start with an id set
-                RTreeUpdater updater( param, tree, grad, hess, smat, group_id );
-                int num_pruned;
-                tree.param.max_depth = updater.do_boost( num_pruned );
-
-                if( !silent ){
-                    printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
-                            tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                if( tree_maker == 0 ){
+                    // start with an id set
+                    RTreeUpdater updater( param, tree, grad, hess, smat, root_index );
+                    int num_pruned;
+                    tree.param.max_depth = updater.do_boost( num_pruned );
+                    if( !silent ){
+                        printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
+                                tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                    }
+                }else{
+                    ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index );
+                    maker.Make();
                 }
             }
         private:
             int silent;
+            int tree_maker;
             RegTree tree;
             TreeParamTrain param;
         private:
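ColTreeMaker::Make() is still a stub in this commit, but the InitData scaffolding already lays out the concurrency plan: stemp holds one statistics slot per (thread, tree node) pair, so the dynamically scheduled per-feature loop in FindSplit can accumulate without locks. A sketch of that pattern, with plain doubles standing in for ThreadEntry and a hypothetical final reduction that the patch has not written yet:

    #include <omp.h>
    #include <cstdio>
    #include <vector>

    int main( void ){
        const int num_nodes = 4;
        int nthread = 1;
        #pragma omp parallel
        {
            nthread = omp_get_num_threads();  // same probe InitData uses
        }
        // one row of per-node accumulators for each thread: no synchronization needed
        std::vector< std::vector<double> > stemp( nthread, std::vector<double>( num_nodes, 0.0 ) );
        #pragma omp parallel for schedule( dynamic, 1 )
        for( int fid = 0; fid < 100; ++ fid ){
            const int tid = omp_get_thread_num();
            stemp[tid][fid % num_nodes] += 1.0;   // accumulate into this thread's slots only
        }
        // hypothetical reduction: combine per-thread slots into one answer per node
        for( int nid = 0; nid < num_nodes; ++ nid ){
            double sum = 0.0;
            for( int tid = 0; tid < nthread; ++ tid ) sum += stemp[tid][nid];
            printf( "node %d total: %g\n", nid, sum );
        }
        return 0;
    }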
diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h
index 74f864455..31501f215 100644
--- a/booster/tree/xgboost_tree_model.h
+++ b/booster/tree/xgboost_tree_model.h
@@ -391,7 +391,7 @@ namespace xgboost{
             }
         public:
             // calculate the cost of loss function
-            inline double CalcCost( double sum_grad, double sum_hess ) const{
+            inline double CalcGain( double sum_grad, double sum_hess ) const{
                 if( sum_hess < min_child_weight ){
                     return 0.0;
                 }
@@ -405,29 +405,37 @@
             }
             // KEY:layerwise
             // calculate cost of root
-            inline double CalcRootCost( double sum_grad, double sum_hess ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
+            inline double CalcRootGain( double sum_grad, double sum_hess ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
                 else return 0.0;
             }
             // KEY:layerwise
             // calculate the cost after split
             // base_weight: the base_weight of parent
-            inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
-                else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess );
+            inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
+                else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
             }
             // calculate the weight of leaf
             inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
                 if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
                 else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
             }
+            /*! \brief whether we need forward, small-to-big search: default right */
+            inline bool need_forward_search( void ) const{
+                return this->default_direction != 1;
+            }
+            /*! \brief whether we need backward, big-to-small search: default left */
+            inline bool need_backward_search( void ) const{
+                return this->default_direction != 2;
+            }
             /*! \brief given the loss change, whether we need to invoke pruning */
             inline bool need_prune( double loss_chg, int depth ) const{
-                return loss_chg < min_split_loss;
+                return loss_chg < this->min_split_loss;
             }
             /*! \brief whether we can split with current hessian */
             inline bool cannot_split( double sum_hess, int depth ) const{
-                return sum_hess < min_child_weight * 2.0;
+                return sum_hess < this->min_child_weight * 2.0;
             }
         };
     };
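The need_forward_search/need_backward_search helpers name the default_direction convention used by xgboost_svdf_tree.hpp, and the CalcGain family gains its layerwise variants: statistics are re-centred at the parent's base weight, so a child's gain measures its improvement over simply inheriting the parent's prediction. The worked sketch below assumes the classic second-order formulas weight = -G/(H+lambda) and gain = G^2/(H+lambda) for the one- and two-argument bodies, which are outside these hunks; the layerwise wrappers copy the diff exactly:

    #include <cstdio>

    static const double lambda = 1.0;
    // assumed base formulas; the real bodies are not shown in this patch
    static double CalcWeight( double G, double H ){ return -G / ( H + lambda ); }
    static double CalcGain( double G, double H ){ return G * G / ( H + lambda ); }
    // layerwise variants, as introduced by the diff
    static double CalcGainLayer( double G, double H, double base_weight ){
        return CalcGain( G + H * base_weight, H );
    }
    static double CalcWeightLayer( double G, double H, double parent_base_weight ){
        return parent_base_weight + CalcWeight( G + parent_base_weight * H, H );
    }

    int main( void ){
        // a node with sum_grad = -3, sum_hess = 4 whose parent settled at weight 0.5
        printf( "plain weight = %g\n", CalcWeight( -3.0, 4.0 ) );            // 3/5 = 0.6
        printf( "layer weight = %g\n", CalcWeightLayer( -3.0, 4.0, 0.5 ) );  // 0.5 + 1/5 = 0.7
        printf( "layer gain   = %g\n", CalcGainLayer( -3.0, 4.0, 0.5 ) );    // (-1)^2/5 = 0.2
        return 0;
    }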
diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h
index e7d5636cc..dcb18a45c 100644
--- a/booster/xgboost_data.h
+++ b/booster/xgboost_data.h
@@ -66,6 +66,8 @@ namespace xgboost{
             /*! \return feature value in current position */
             inline bst_float fvalue( void ) const;
         };
+        /*! \brief backward iterator over column */
+        struct ColBackIter : public ColIter {};
     public:
         /*!
         * \brief get number of rows
@@ -83,17 +85,21 @@
          * \return row iterator
          */
         inline RowIter GetRow( size_t ridx ) const;
+
+        /*! \return whether column access is enabled */
+        inline bool HaveColAccess( void ) const;
         /*!
          * \brief get column iterator, the columns must be sorted by feature value
          * \param ridx column index
          * \return column iterator
          */
         inline ColIter GetSortedCol( size_t ridx ) const;
-
-        /*! \return the view of derived class */
-        inline const Derived& self( void ) const{
-            return *static_cast<const Derived*>(this);
-        }
+        /*!
+         * \brief get column backward iterator, starting from the biggest fvalue and iterating back
+         * \param ridx column index
+         * \return reverse column iterator
+         */
+        inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
     };
 };
@@ -152,6 +158,16 @@
                 return this->findex();
             }
         };
+        /*! \brief reverse column iterator */
+        struct ColBackIter: public ColIter{
+            // shadows RowIter::Next
+            inline bool Next( void ){
+                if( dptr_ == end_ ) return false;
+                else{
+                    -- dptr_; return true;
+                }
+            }
+        };
     public:
         /*! \brief constructor */
         FMatrixS( void ){ this->Clear(); }
@@ -229,6 +245,14 @@
             it.end_ = &col_data_[ col_ptr_[cidx+1] ] - 1;
             return it;
         }
+        /*! \brief get reverse col iterator */
+        inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
+            utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
+            ColBackIter it;
+            it.dptr_ = &col_data_[ col_ptr_[cidx+1] ];
+            it.end_ = &col_data_[ col_ptr_[cidx] ];
+            return it;
+        }
         /*!
          * \brief initialize the data so that we have both column and row major
          *        access, call this whenever we need column access
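The ColBackIter added above follows a call-Next()-before-read protocol: dptr_ starts one past the largest entry of the column, end_ marks the smallest, and Next() steps backward until every entry has been visited. A minimal model of that protocol over a plain array (the real iterator walks sparse column entries sorted by fvalue, and the names below are illustrative only):

    #include <cstdio>

    struct BackIter{
        const float *dptr_, *end_;
        inline bool Next( void ){
            if( dptr_ == end_ ) return false;
            -- dptr_; return true;
        }
        inline float fvalue( void ) const{ return *dptr_; }
    };

    int main( void ){
        float col[] = { 0.1f, 0.4f, 0.9f };  // sorted by feature value
        BackIter it;
        it.dptr_ = col + 3;  // one past the largest fvalue
        it.end_  = col;      // smallest entry, visited last
        while( it.Next() ){
            printf( "%g\n", it.fvalue() );   // prints 0.9, 0.4, 0.1
        }
        return 0;
    }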