diff --git a/booster/tree/xgboost_col_treemaker.hpp b/booster/tree/xgboost_col_treemaker.hpp
index 3b1258e21..1c82464e3 100644
--- a/booster/tree/xgboost_col_treemaker.hpp
+++ b/booster/tree/xgboost_col_treemaker.hpp
@@ -6,8 +6,9 @@
  * use a column based approach, with OpenMP
  * \author Tianqi Chen: tianqi.tchen@gmail.com
  */
-#include <omp.h>
+// use openmp
 #include <vector>
+#include <omp.h>
 #include "xgboost_tree_model.h"
 #include "../../utils/xgboost_random.h"
 
@@ -29,28 +30,55 @@ namespace xgboost{
             utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
             utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
         }
-        inline void Make( void ){
+        inline void Make( int& stat_max_depth, int& stat_num_pruned ){
+            this->InitData();
+            this->InitNewNode( this->qexpand );
+            stat_max_depth = 0;
+
+            for( int depth = 0; depth < param.max_depth; ++ depth ){
+                this->FindSplit( depth );
+                this->UpdateQueueExpand( this->qexpand );
+                this->InitNewNode( this->qexpand );
+                // if nothing is left to be expanded, break
+                if( qexpand.size() == 0 ) break;
+                stat_max_depth = depth + 1;
+            }
+            // set all remaining expanding nodes to leaves
+            for( size_t i = 0; i < qexpand.size(); ++ i ){
+                const int nid = qexpand[i];
+                tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
+            }
+            // start pruning the tree
+            stat_num_pruned = this->DoPrune();
         }
     private:
        // statistics that are helpful for deciding a split
        struct SplitEntry{
-            /*! \brief gain in terms of loss */
-            float loss_gain;
-            /*! \brief weight calculated related to current data */
-            float weight;
+            /*! \brief loss change after splitting this node */
+            float loss_chg;
             /*! \brief split index */
             unsigned sindex;
             /*! \brief split value */
             float split_value;
             /*! \brief constructor */
             SplitEntry( void ){
-                weight = loss_gain = 0.0f;
+                loss_chg = 0.0f;
                 split_value = 0.0f; sindex = 0;
             }
-            inline void SetSplit( unsigned split_index, float split_value, bool default_left ){
-                if( default_left ) split_index |= (1U << 31);
-                this->sindex = split_index;
-                this->split_value = split_value;
+            inline void Update( const SplitEntry &e ){
+                if( e.loss_chg > this->loss_chg ){
+                    this->loss_chg = e.loss_chg;
+                    this->sindex = e.sindex;
+                    this->split_value = e.split_value;
+                }
+            }
+            inline void Update( float loss_chg, unsigned split_index, float split_value, bool default_left ){
+                if( loss_chg > this->loss_chg ){
+                    this->loss_chg = loss_chg;
+                    if( default_left ) split_index |= (1U << 31);
+                    this->sindex = split_index;
+                    this->split_value = split_value;
+                }
             }
             inline unsigned split_index( void ) const{
                 return sindex & ( (1U<<31) - 1U );
@@ -59,6 +87,23 @@ namespace xgboost{
                 return (sindex >> 31) != 0;
             }
         };
+        struct NodeEntry{
+            /*! \brief sum gradient statistics */
+            double sum_grad;
+            /*! \brief sum hessian statistics */
+            double sum_hess;
+            /*! \brief loss of this node if it is not split */
+            float root_gain;
+            /*! \brief weight calculated for the current data */
+            float weight;
+            /*! \brief current best solution */
+            SplitEntry best;
+            NodeEntry( void ){
+                sum_grad = sum_hess = 0.0;
+                weight = root_gain = 0.0f;
+            }
+        };
+        /*! \brief per-thread x per-node entry to store temporary data */
         struct ThreadEntry{
             /*! \brief sum gradient statistics */
             double sum_grad;
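
[annotation] The new NodeEntry caches each expanding node's gradient/hessian sums, its gain when left unsplit (root_gain), and its best split candidate; these are exactly what the loss_chg computation in EnumerateSplit below consumes. CalcGain, CalcWeight and CalcRootGain are defined in xgboost_tree_model.h and are not part of this diff; the sketch below assumes the plain L2-regularized form (gain = g^2/(h + lambda), weight = -g/(h + lambda)) and ignores the parent-weight argument used by the layerwise option, so it is an illustration rather than the exact library code:

    #include <cstdio>

    int main( void ){
        const double reg_lambda = 1.0;
        // gradient/hessian sums of the node and of a proposed left child
        double sum_grad = -8.0, sum_hess = 10.0;
        double lgrad = -6.0, lhess = 4.0;
        double rgrad = sum_grad - lgrad, rhess = sum_hess - lhess;
        // gain of keeping the node unsplit: what snode[nid].root_gain caches
        double root_gain = sum_grad * sum_grad / ( sum_hess + reg_lambda );
        // loss_chg as formed in EnumerateSplit: children gain minus root gain
        double loss_chg = lgrad * lgrad / ( lhess + reg_lambda )
                        + rgrad * rgrad / ( rhess + reg_lambda )
                        - root_gain;
        // leaf weight that set_leaf scales by learning_rate
        double weight = -sum_grad / ( sum_hess + reg_lambda );
        printf( "loss_chg=%g weight=%g\n", loss_chg, weight );
        return 0;
    }
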
@@ -70,24 +115,77 @@ namespace xgboost{
             /*! \brief current best solution */
             SplitEntry best;
             /*! \brief constructor */
-            ThreadEntry( void ){
-                sum_grad = sum_hess = 0;
-            }
+            ThreadEntry( void ){
+                this->ClearStats();
+            }
+            /*! \brief clear statistics */
+            inline void ClearStats( void ){
+                sum_grad = sum_hess = 0.0;
+            }
         };
     private:
+        // try to prune off the current leaf
+        inline void TryPruneLeaf( int nid, int depth ){
+            if( tree[ nid ].is_root() ) return;
+            int pid = tree[ nid ].parent();
+            RegTree::NodeStat &s = tree.stat( pid );
+            ++ s.leaf_child_cnt;
+
+            if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
+                this->stat_num_pruned += 2;
+                // the children need to be pruned
+                tree.ChangeToLeaf( pid, param.learning_rate * snode[pid].weight );
+                // tail recursion
+                this->TryPruneLeaf( pid, depth - 1 );
+            }
+        }
+        // prune the tree
+        inline int DoPrune( void ){
+            this->stat_num_pruned = 0;
+            // initialize auxiliary statistics
+            for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
+                tree.stat( nid ).leaf_child_cnt = 0;
+                tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
+            }
+            for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
+                if( tree[ nid ].is_leaf() ) this->TryPruneLeaf( nid, tree.GetDepth(nid) );
+            }
+            return this->stat_num_pruned;
+        }
+    private:
+        // main procedure
         inline void CleanSTemp( const std::vector<int> &qexpand ){
             for( size_t i = 0; i < stemp.size(); ++ i ){
                 for( size_t j = 0; j < qexpand.size(); ++ j ){
-                    ThreadEntry &e = stemp[i][ qexpand[j] ];
-                    e.sum_grad = e.sum_hess = 0.0f;
+                    stemp[i][ qexpand[j] ].ClearStats();
                 }
             }
         }
+        // update the expansion queue
+        inline void UpdateQueueExpand( std::vector<int> &qexpand ){
+            std::vector<int> newnodes;
+            for( size_t i = 0; i < qexpand.size(); ++ i ){
+                const int nid = qexpand[i];
+                if( !tree[ nid ].is_leaf() ){
+                    newnodes.push_back( tree[nid].cleft() );
+                    newnodes.push_back( tree[nid].cright() );
+                }
+            }
+            // use the new nodes as qexpand
+            qexpand = newnodes;
+        }
         // make leaf nodes for all qexpand, update node statistics, mark leaf value
-        inline void UpdateSNode( const std::vector<int> &qexpand ){
-            this->CleanSTemp( qexpand );
-            // step 1: find sum statistics
+        inline void InitNewNode( const std::vector<int> &qexpand ){
+            {// setup statistics space for each tree node
+                for( size_t i = 0; i < stemp.size(); ++ i ){
+                    stemp[i].resize( tree.param.num_nodes, ThreadEntry() );
+                }
+                snode.resize( tree.param.num_nodes, NodeEntry() );
+            }
+            const unsigned ndata = static_cast<unsigned>( position.size() );
+
             #pragma omp parallel for schedule( static )
             for( unsigned i = 0; i < ndata; ++ i ){
                 const int tid = omp_get_thread_num();
@@ -95,42 +193,191 @@ namespace xgboost{
                 stemp[tid][ position[i] ].sum_grad += grad[i];
                 stemp[tid][ position[i] ].sum_hess += hess[i];
             }
+
             for( size_t j = 0; j < qexpand.size(); ++ j ){
+                const int nid = qexpand[ j ];
                 double sum_grad = 0.0, sum_hess = 0.0;
                 for( size_t tid = 0; tid < stemp.size(); tid ++ ){
-                    sum_grad += stemp[tid][j].sum_grad;
-                    sum_hess += stemp[tid][j].sum_hess;
+                    sum_grad += stemp[tid][nid].sum_grad;
+                    sum_hess += stemp[tid][nid].sum_hess;
                 }
-                if( !tree[j].is_root() ){
-                    const float pweight = snode[ tree[j].parent() ].weight;
-                    snode[j].weight = param.CalcWeight( sum_grad, sum_hess, pweight );
+                // update node statistics
+                snode[nid].sum_grad = sum_grad;
+                snode[nid].sum_hess = sum_hess;
+                snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
+                if( !tree[nid].is_root() ){
+                    snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, snode[ tree[nid].parent() ].weight );
                 }else{
-                    snode[j].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
-                    snode[j].loss_gain = param.CalcGain( sum_grad, sum_hess, 0.0f );
+                    snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
                 }
             }
         }
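
[annotation] InitNewNode relies on a classic two-phase pattern: each OpenMP thread accumulates into its own row of the nthread x num_nodes table stemp, so the parallel loop needs no atomics, and a cheap serial reduction folds the rows together afterwards. A self-contained miniature of the same pattern, with a counter standing in for the gradient statistics:

    #include <omp.h>
    #include <vector>
    #include <cstdio>

    int main( void ){
        const int num_nodes = 4, ndata = 1000;
        int nthread = 1;
        #pragma omp parallel
        {
            nthread = omp_get_num_threads();
        }
        std::vector< std::vector<double> > tmp( nthread, std::vector<double>( num_nodes, 0.0 ) );
        #pragma omp parallel for schedule( static )
        for( int i = 0; i < ndata; ++ i ){
            const int tid = omp_get_thread_num();
            tmp[ tid ][ i % num_nodes ] += 1.0;   // stands in for grad[i]/hess[i]
        }
        // serial reduction, mirroring the loop over stemp above
        for( int nid = 0; nid < num_nodes; ++ nid ){
            double sum = 0.0;
            for( int tid = 0; tid < nthread; ++ tid ) sum += tmp[ tid ][ nid ];
            printf( "node %d: %g\n", nid, sum );
        }
        return 0;
    }
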
-        // find split at current level
-        inline void FindSplit( int depth ){
-            unsigned nsize = static_cast<unsigned>(feat_index.size());
-            #pragma omp parallel for schedule( dynamic, 1 )
-            for( unsigned i = 0; i < nsize; ++ i ){
-                const unsigned fid = feat_index[i];
-                const int tid = omp_get_thread_num();
+    private:
+        // enumerate the split values of a specific feature
+        template<typename Iter>
+        inline void EnumerateSplit( Iter it, const unsigned fid, std::vector<ThreadEntry> &temp, bool is_forward_search ){
+            while( it.Next() ){
+                const unsigned ridx = it.rindex();
+                const int nid = position[ ridx ];
+                if( nid < 0 ) continue;
+
+                const float fvalue = it.fvalue();
+                ThreadEntry &e = temp[ nid ];
+                // test if this is the first hit; this is fine because we set everything to 0 during init
+                if( e.sum_hess == 0.0 ){
+                    e.sum_grad = grad[ ridx ];
+                    e.sum_hess = hess[ ridx ];
+                    e.last_fvalue = fvalue;
+                }else{
+                    // try to find a split
+                    if( fabsf(fvalue - e.last_fvalue) > rt_2eps && e.sum_hess >= param.min_child_weight ){
+                        const double csum_hess = snode[ nid ].sum_hess - e.sum_hess;
+                        if( csum_hess >= param.min_child_weight ){
+                            const double csum_grad = snode[nid].sum_grad - e.sum_grad;
+                            const double loss_chg =
+                                + param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+                                + param.CalcGain( csum_grad, csum_hess, snode[nid].weight )
+                                - snode[nid].root_gain;
+                            e.best.Update( loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search );
+                        }
+                    }
+                    // update the statistics
+                    e.sum_grad += grad[ ridx ];
+                    e.sum_hess += hess[ ridx ];
+                    e.last_fvalue = fvalue;
+                }
+            }
+            // after scanning the whole column, also try the split that keeps all the scanned statistics on one side
+            for( size_t i = 0; i < qexpand.size(); ++ i ){
+                const int nid = qexpand[ i ];
+                ThreadEntry &e = temp[ nid ];
+                const double csum_hess = snode[nid].sum_hess - e.sum_hess;
+                if( e.sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
+                    const double csum_grad = snode[nid].sum_grad - e.sum_grad;
+                    const double loss_chg =
+                        + param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+                        + param.CalcGain( csum_grad, csum_hess, snode[nid].weight )
+                        - snode[nid].root_gain;
+                    const float delta = is_forward_search ? rt_eps : -rt_eps;
+                    e.best.Update( loss_chg, fid, e.last_fvalue + delta, !is_forward_search );
+                }
+            }
+        }
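
[annotation] EnumerateSplit is run twice per feature, once over the forward-sorted column and once over the reverse-sorted one. The scanned side carries the accumulated statistics; the complement, which includes rows whose value for this feature is missing, goes the other way, which is why Update receives !is_forward_search as default_left. That direction is packed into bit 31 of sindex, exactly as SplitEntry does it:

    #include <cstdio>

    int main( void ){
        unsigned split_index = 42;
        bool default_left = true;            // e.g. a backward scan: missing values go left
        if( default_left ) split_index |= (1U << 31);
        unsigned sindex = split_index;
        // recover both pieces, as split_index() and default_left() do
        unsigned fid = sindex & ( (1U << 31) - 1U );  // 42 again
        bool dleft = ( sindex >> 31 ) != 0;           // true
        printf( "fid=%u default_left=%d\n", fid, (int)dleft );
        return 0;
    }
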
+
+        // find splits at the current level
+        inline void FindSplit( int depth ){
+            const unsigned nsize = static_cast<unsigned>( feat_index.size() );
+
+            if( param.need_forward_search() ){
+                this->CleanSTemp( this->qexpand );
+                #pragma omp parallel for schedule( dynamic, 1 )
+                for( unsigned i = 0; i < nsize; ++ i ){
+                    const unsigned fid = feat_index[i];
+                    const int tid = omp_get_thread_num();
+                    this->EnumerateSplit( smat.GetSortedCol(fid), fid, stemp[tid], true );
+                }
+            }
+            if( param.need_backward_search() ){
+                this->CleanSTemp( this->qexpand );
+                #pragma omp parallel for schedule( dynamic, 1 )
+                for( unsigned i = 0; i < nsize; ++ i ){
+                    const unsigned fid = feat_index[i];
+                    const int tid = omp_get_thread_num();
+                    this->EnumerateSplit( smat.GetReverseSortedCol(fid), fid, stemp[tid], false );
+                }
+            }
+
+            // after this, each thread's stemp holds its best candidates; aggregate the results
+            for( size_t i = 0; i < qexpand.size(); ++ i ){
+                const int nid = qexpand[ i ];
+                NodeEntry &e = snode[ nid ];
+                for( int tid = 0; tid < this->nthread; ++ tid ){
+                    e.best.Update( stemp[ tid ][ nid ].best );
+                }
+                // now we know the solution in snode[ nid ]; set the split
+                if( snode[ nid ].best.loss_chg > rt_eps ){
+                    tree.AddChilds( nid );
+                    tree[ nid ].set_split( e.best.split_index(), e.best.split_value, e.best.default_left() );
+                }else{
+                    tree[ nid ].set_leaf( e.weight * param.learning_rate );
+                }
+            }
+
+            {// reset position
+                // step 1: push instances to the default branch, and mark instances in leaf nodes with -1
+                const unsigned ndata = static_cast<unsigned>( position.size() );
+                #pragma omp parallel for schedule( static )
+                for( unsigned i = 0; i < ndata; ++ i ){
+                    const int nid = position[i];
+                    if( nid >= 0 ){
+                        if( tree[ nid ].is_leaf() ){
+                            position[i] = -1;
+                        }else{
+                            // push to the default branch, correct later
+                            position[i] = tree[nid].default_left() ? tree[nid].cleft() : tree[nid].cright();
+                        }
+                    }
+                }
+
+                // step 2: classify the non-default data into the right places
+                std::vector<unsigned> fsplits;
+
+                for( size_t i = 0; i < qexpand.size(); ++ i ){
+                    const int nid = qexpand[i];
+                    if( !tree[nid].is_leaf() ) fsplits.push_back( tree[nid].split_index() );
+                }
+                std::sort( fsplits.begin(), fsplits.end() );
+                fsplits.resize( std::unique( fsplits.begin(), fsplits.end() ) - fsplits.begin() );
+
+                const unsigned nfeats = static_cast<unsigned>( fsplits.size() );
+                #pragma omp parallel for schedule( dynamic, 1 )
+                for( unsigned i = 0; i < nfeats; ++ i ){
+                    const unsigned fid = fsplits[i];
+                    for( typename FMatrix::ColIter it = smat.GetSortedCol( fid ); it.Next(); ){
+                        const unsigned ridx = it.rindex();
+                        int nid = position[ ridx ];
+                        if( nid == -1 ) continue;
+                        // go back to the parent, and correct the instances that should not be on the default branch
+                        nid = tree[ nid ].parent();
+                        if( tree[ nid ].split_index() == fid ){
+                            if( it.fvalue() < tree[nid].split_cond() ){
+                                position[ ridx ] = tree[ nid ].cleft();
+                            }else{
+                                position[ ridx ] = tree[ nid ].cright();
+                            }
+                        }
+                    }
+                }
             }
         }
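
[annotation] Step 2 above only touches the features that were actually chosen as split conditions: fsplits is deduplicated with the standard sort/unique/resize idiom, so each such column is rescanned exactly once no matter how many expanding nodes picked it. The idiom in isolation:

    #include <algorithm>
    #include <vector>
    #include <cstdio>

    int main( void ){
        std::vector<unsigned> fsplits;
        fsplits.push_back( 3 ); fsplits.push_back( 1 ); fsplits.push_back( 3 );
        std::sort( fsplits.begin(), fsplits.end() );
        fsplits.resize( std::unique( fsplits.begin(), fsplits.end() ) - fsplits.begin() );
        for( size_t i = 0; i < fsplits.size(); ++ i ){
            printf( "%u\n", fsplits[i] );   // prints 1, then 3
        }
        return 0;
    }
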
+    private:
         // initialize temp data structure
         inline void InitData( void ){
-            position.resize( grad.size() );
-            if( root_index.size() == 0 ){
-                std::fill( position.begin(), position.end(), 0 );
-            }else{
-                for( size_t i = 0; i < root_index.size(); ++ i ){
-                    position[i] = root_index[i];
-                    utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
+            {
+                position.resize( grad.size() );
+                if( root_index.size() == 0 ){
+                    std::fill( position.begin(), position.end(), 0 );
+                }else{
+                    for( size_t i = 0; i < root_index.size(); ++ i ){
+                        position[i] = root_index[i];
+                        utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
+                    }
+                }
+                // mark deletion for the deleted data (instances with negative hessian)
+                for( size_t i = 0; i < grad.size(); ++ i ){
+                    if( hess[i] < 0.0f ) position[i] = -1;
+                }
+                if( param.subsample < 1.0f - 1e-6f ){
+                    for( size_t i = 0; i < grad.size(); ++ i ){
+                        if( hess[i] < 0.0f ) continue;
+                        if( random::SampleBinary( param.subsample ) == 0 ){
+                            position[ i ] = -1;
+                        }
+                    }
                 }
             }
+
             {// initialize feature index
                 int ncol = static_cast<int>( smat.NumCol() );
                 for( int i = 0; i < ncol; i ++ ){
@@ -141,22 +388,22 @@
                 random::Shuffle( feat_index );
             }
             {// setup temp space for each thread
-                int nthread;
+                if( param.nthread != 0 ){
+                    omp_set_num_threads( param.nthread );
+                }
                 #pragma omp parallel
                 {
-                    nthread = omp_get_num_threads();
-                }
+                    this->nthread = omp_get_num_threads();
+                }
+
                 // reserve a small space
-                stemp.resize( nthread, std::vector<ThreadEntry>() );
+                stemp.resize( this->nthread, std::vector<ThreadEntry>() );
                 for( size_t i = 0; i < stemp.size(); ++ i ){
                     stemp[i].reserve( 256 );
-                    stemp[i].resize( tree.param.num_roots, ThreadEntry() );
                 }
+                snode.reserve( 256 );
             }
-            {// setup statistics space for each tree node
-                snode.resize( tree.param.num_roots, SplitEntry() );
-            }
-
+
             {// expand query
                 qexpand.reserve( 256 ); qexpand.clear();
                 for( int i = 0; i < tree.param.num_roots; ++ i ){
@@ -166,16 +413,20 @@
         }
    private:
        // local helper tmp data structure
+        // statistics
+        int stat_num_pruned;
+        // number of OpenMP threads used during training
+        int nthread;
        // queue of nodes to be expanded
        std::vector<int> qexpand;
        // Per feature: shuffled index of each feature
        std::vector<unsigned> feat_index;
        // Instance Data: current node position in the tree of each instance
-        std::vector<unsigned> position;
+        std::vector<int> position;
        // TreeNode Data: statistics for each constructed node
-        std::vector<SplitEntry> snode;
+        std::vector<NodeEntry> snode;
        // PerThread x PerTreeNode: statistics for per-thread construction
-        std::vector< std::vector<ThreadEntry> > stemp;
+        std::vector< std::vector<ThreadEntry> > stemp;
    private:
        // original data that supports tree construction
        RegTree &tree;
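
[annotation] The thread setup in InitData now honors the new nthread parameter and then asks OpenMP how many threads it actually granted, so stemp can be sized with one scratch row per thread. The handshake in isolation (the value 4 stands in for param.nthread; 0 keeps the system default, as in the diff):

    #include <omp.h>
    #include <cstdio>

    int main( void ){
        int requested = 4;
        if( requested != 0 ) omp_set_num_threads( requested );
        int nthread = 1;
        #pragma omp parallel
        {
            nthread = omp_get_num_threads();  // every thread writes the same value
        }
        printf( "using %d threads\n", nthread );
        return 0;
    }
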
diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp
index b171f8876..bc3ad430a 100644
--- a/booster/tree/xgboost_svdf_tree.hpp
+++ b/booster/tree/xgboost_svdf_tree.hpp
@@ -146,7 +146,7 @@ namespace xgboost{
                 // tail recursion
                 this->try_prune_leaf( pid, depth - 1 );
             }
-            }
+        }
         // make leaf for current node :)
         inline void make_leaf( Task tsk, double sum_grad, double sum_hess, bool compute ){
             for( unsigned i = 0; i < tsk.len; i ++ ){
diff --git a/booster/tree/xgboost_tree.hpp b/booster/tree/xgboost_tree.hpp
index c74030ca6..5d54bc03c 100644
--- a/booster/tree/xgboost_tree.hpp
+++ b/booster/tree/xgboost_tree.hpp
@@ -116,18 +116,18 @@ namespace xgboost{
             if( !silent ){
                 printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
             }
+            int num_pruned;
             if( tree_maker == 0 ){
                 // start with an id set
                 RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
-                int num_pruned;
                 tree.param.max_depth = updater.do_boost( num_pruned );
-                if( !silent ){
-                    printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
-                            tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
-                }
             }else{
                 ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index );
-                maker.Make();
+                maker.Make( tree.param.max_depth, num_pruned );
+            }
+            if( !silent ){
+                printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
+                        tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
             }
         }
     private:
diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h
index d8dd8e03a..efe986b36 100644
--- a/booster/tree/xgboost_tree_model.h
+++ b/booster/tree/xgboost_tree_model.h
@@ -329,6 +329,8 @@ namespace xgboost{
             float subsample;
             // whether to use layerwise aware regularization
             int use_layerwise;
+            // number of threads used for tree construction when OpenMP is enabled; 0 means use the system default
+            int nthread;
             /*! \brief constructor */
             TreeParamTrain( void ){
                 learning_rate = 0.3f;
@@ -339,6 +341,7 @@
                 default_direction = 0;
                 subsample = 1.0f;
                 use_layerwise = 0;
+                nthread = 0;
             }
             /*!
              * \brief set parameters from outside
@@ -359,6 +362,7 @@
                 if( !strcmp( name, "reg_method") ) reg_method = (float)atof( val );
                 if( !strcmp( name, "subsample") ) subsample = (float)atof( val );
                 if( !strcmp( name, "use_layerwise") ) use_layerwise = atoi( val );
+                if( !strcmp( name, "nthread") ) nthread = atoi( val );
                 if( !strcmp( name, "default_direction") ) {
                     if( !strcmp( val, "learn") ) default_direction = 0;
                     if( !strcmp( val, "left") ) default_direction = 1;
diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h
index dcb18a45c..4165f2fc2 100644
--- a/booster/xgboost_data.h
+++ b/booster/xgboost_data.h
@@ -246,9 +246,9 @@ namespace xgboost{
             return it;
         }
         /*! \brief get col iterator */
-        inline ColIter GetReverseSortedCol( size_t cidx ) const{
+        inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
             utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
-            ColIter it;
+            ColBackIter it;
             it.dptr_ = &col_data_[ col_ptr_[cidx+1] ];
             it.end_ = &col_data_[ col_ptr_[cidx] ];
             return it;
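
[annotation] The xgboost_data.h hunk is a genuine bug fix: GetReverseSortedCol fills dptr_ with the one-past-the-end pointer and end_ with the column start, i.e. a backward traversal, yet it was declared to return the forward ColIter. Presumably ColBackIter::Next() steps dptr_ downward (its definition is not shown in this diff), so the declared type now matches the traversal direction that the initialization implies. A miniature of the two traversal orders over one sorted CSC column:

    #include <cstdio>

    int main( void ){
        float col[] = { 0.5f, 1.5f, 2.5f };   // one column, sorted by feature value
        const int len = 3;
        for( int i = 0; i < len; ++ i ) printf( "forward  %g\n", col[i] );      // ColIter order
        for( int i = len - 1; i >= 0; -- i ) printf( "backward %g\n", col[i] ); // ColBackIter order
        return 0;
    }
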