From d982be9dcabbdfd3efb605c3454458439e88d7f9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 5 Mar 2014 14:38:13 -0800 Subject: [PATCH] complete row maker --- booster/tree/xgboost_row_treemaker.hpp | 198 +++++++++++++++++++++++-- booster/xgboost_data.h | 40 +++-- 2 files changed, 216 insertions(+), 22 deletions(-) diff --git a/booster/tree/xgboost_row_treemaker.hpp b/booster/tree/xgboost_row_treemaker.hpp index 00f16af75..8c5251c9f 100644 --- a/booster/tree/xgboost_row_treemaker.hpp +++ b/booster/tree/xgboost_row_treemaker.hpp @@ -36,8 +36,8 @@ namespace xgboost{ this->InitNewNode( this->qexpand ); stat_max_depth = 0; - for( int depth = 0; depth < param.max_depth; ++ depth ){ - //this->FindSplit( this->qexpand ); + for( int depth = 0; depth < param.max_depth; ++ depth ){ + this->FindSplit( this->qexpand, depth ); this->UpdateQueueExpand( this->qexpand ); this->InitNewNode( this->qexpand ); // if nothing left to be expand, break @@ -51,20 +51,24 @@ namespace xgboost{ } // start prunning the tree stat_num_pruned = this->DoPrune(); - } + } private: // make leaf nodes for all qexpand, update node statistics, mark leaf value inline void InitNewNode( const std::vector &qexpand ){ snode.resize( tree.param.num_nodes, NodeEntry() ); - for( size_t j = 0; j < qexpand.size(); ++ j ){ + for( size_t j = 0; j < qexpand.size(); ++j ){ const int nid = qexpand[ j ]; double sum_grad = 0.0, sum_hess = 0.0; - // TODO: get sum statistics for nid + for( bst_uint i = node_bound[nid].first; i < node_bound[nid].second; ++i ){ + const bst_uint ridx = row_index_set[i]; + sum_grad += grad[ridx]; sum_hess += hess[ridx]; + } // update node statistics snode[nid].sum_grad = sum_grad; snode[nid].sum_hess = sum_hess; + snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess ); if( !tree[nid].is_root() ){ snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, snode[ tree[nid].parent() ].weight ); @@ -73,10 +77,182 @@ namespace xgboost{ } } } - // find splits at current level - inline void FindSplit( int nid ){ - // TODO + private: + // enumerate the split values of specific feature + template + inline void EnumerateSplit( Iter it, SplitEntry &best, const int nid, const unsigned fid, bool is_forward_search ){ + float last_fvalue = 0.0f; + double sum_hess = 0.0, sum_grad = 0.0; + const NodeEntry enode = snode[ nid ]; + while( it.Next() ){ + const bst_uint ridx = it.rindex(); + const float fvalue = it.fvalue(); + + if( sum_hess == 0.0 ){ + sum_grad = grad[ ridx ]; + sum_hess = hess[ ridx ]; + last_fvalue = fvalue; + }else{ + // try to find a split + if( fabsf(fvalue - last_fvalue) > rt_2eps && sum_hess >= param.min_child_weight ){ + const double csum_hess = enode.sum_hess - sum_hess; + if( csum_hess >= param.min_child_weight ){ + const double csum_grad = enode.sum_grad - sum_grad; + const double loss_chg = + + param.CalcGain( sum_grad, sum_hess, enode.weight ) + + param.CalcGain( csum_grad, csum_hess, enode.weight ) + - enode.root_gain; + best.Update( loss_chg, fid, (fvalue + last_fvalue) * 0.5f, !is_forward_search ); + }else{ + // the rest part doesn't meet split condition anyway, return + return; + } + } + // update the statistics + sum_grad += grad[ ridx ]; + sum_hess += hess[ ridx ]; + last_fvalue = fvalue; + } + } + + const double csum_hess = enode.sum_hess - sum_hess; + if( sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){ + const double csum_grad = enode.sum_grad - sum_grad; + const double loss_chg = + + param.CalcGain( sum_grad, sum_hess, enode.weight ) + + param.CalcGain( csum_grad, csum_hess, enode.weight ) + - snode[nid].root_gain; + const float delta = is_forward_search ? rt_eps:-rt_eps; + best.Update( loss_chg, fid, last_fvalue + delta, !is_forward_search ); + } + } + private: + inline void FindSplit( const std::vector &qexpand, int depth ){ + int nexpand = (int)qexpand.size(); + if( depth < 3 ){ + for( int i = 0; i < nexpand; ++ i ){ + this->FindSplit( qexpand[i], tmp_rptr[0] ); + } + }else{ + // if get to enough depth, parallelize over node + #pragma omp parallel for schedule(dynamic,1) + for( int i = 0; i < nexpand; ++ i ){ + const int tid = omp_get_thread_num(); + utils::Assert( tid < (int)tmp_rptr.size(), "BUG: FindSplit, tid exceed tmp_rptr size" ); + this->FindSplit( qexpand[i], tmp_rptr[tid] ); + } + } + } + private: + inline void MakeSplit( int nid, unsigned gid ){ + node_bound.resize( tree.param.num_nodes ); + // re-organize the row_index_set after split on nid + const unsigned split_index = tree[nid].split_index(); + const float split_value = tree[nid].split_cond(); + + std::vector right; + bst_uint top = node_bound[nid].first; + for( bst_uint i = node_bound[ nid ].first; i < node_bound[ nid ].second; ++i ){ + const bst_uint ridx = row_index_set[i]; + bool goleft = tree[ nid ].default_left(); + for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){ + if( it.findex() == split_index ){ + if( it.fvalue() < split_value ){ + goleft = true; break; + }else{ + goleft = false; break; + } + } + } + if( goleft ) { + row_index_set[ top ++ ] = ridx; + }else{ + right.push_back( ridx ); + } + } + node_bound[ tree[nid].cleft() ] = std::make_pair( node_bound[nid].first, top ); + node_bound[ tree[nid].cright() ] = std::make_pair( top, node_bound[nid].second ); + + utils::Assert( node_bound[nid].second - top == (bst_uint)right.size(), "BUG:MakeSplit" ); + for( size_t i = 0; i < right.size(); ++ i ){ + row_index_set[ top ++ ] = right[ i ]; + } + } + + // find splits at current level + inline void FindSplit( int nid, std::vector &tmp_rptr ){ + if( tmp_rptr.size() == 0 ){ + tmp_rptr.resize( tree.param.num_feature + 1, 0 ); + } + const bst_uint begin = node_bound[ nid ].first; + const bst_uint end = node_bound[ nid ].second; + const unsigned ncgroup = smat.NumColGroup(); + unsigned best_group = 0; + + for( unsigned gid = 0; gid < ncgroup; ++gid ){ + // records the columns + std::vector centry; + // records the active features + std::vector aclist; + utils::SparseCSRMBuilder builder( tmp_rptr, centry, aclist ); + builder.InitBudget( tree.param.num_feature ); + for( bst_uint i = begin; i < end; ++i ){ + const bst_uint ridx = row_index_set[i]; + for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){ + builder.AddBudget( it.findex() ); + } + } + builder.InitStorage(); + for( bst_uint i = begin; i < end; ++i ){ + const bst_uint ridx = row_index_set[i]; + for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){ + builder.PushElem( it.findex(), FMatrixS::REntry( ridx, it.fvalue() ) ); + } + } + // --- end of building column major matrix --- + // after this point, tmp_rptr and entry is ready to use + int naclist = (int)aclist.size(); + // best entry for each thread + SplitEntry nbest, tbest; + #pragma omp parallel private(tbest) + { + #pragma omp for schedule(dynamic,1) + for( int j = 0; j < naclist; ++j ){ + bst_uint findex = static_cast( aclist[j] ); + // local sort can be faster when the features are sparse + std::sort( centry.begin() + tmp_rptr[findex], centry.begin() + tmp_rptr[findex+1], FMatrixS::REntry::cmp_fvalue ); + if( param.need_forward_search() ){ + this->EnumerateSplit( FMatrixS::ColIter( ¢ry[tmp_rptr[findex]]-1, ¢ry[tmp_rptr[findex+1]] - 1 ), + tbest, nid, findex, true ); + } + if( param.need_backward_search() ){ + this->EnumerateSplit( FMatrixS::ColBackIter( ¢ry[tmp_rptr[findex+1]], ¢ry[tmp_rptr[findex]] ), + tbest, nid, findex, false ); + } + } + #pragma omp critical + { + nbest.Update( tbest ); + } + } + // if current solution gives the best + if( snode[nid].best.Update( nbest ) ){ + best_group = gid; + } + // cleanup tmp_rptr for next usage + builder.Cleanup(); + } + + // at this point, we already know the best split + if( snode[nid].best.loss_chg > rt_eps ){ + const SplitEntry &e = snode[nid].best; + tree.AddChilds( nid ); + tree[ nid ].set_split( e.split_index(), e.split_value, e.default_left() ); + this->MakeSplit( nid, best_group ); + }else{ + tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate ); + } } private: // initialize temp data structure @@ -121,9 +297,9 @@ namespace xgboost{ { this->nthread = omp_get_num_threads(); } + tmp_rptr.resize( this->nthread, std::vector() ); snode.reserve( 256 ); - } - + } {// expand query qexpand.reserve( 256 ); qexpand.clear(); for( int i = 0; i < tree.param.num_roots; ++ i ){ @@ -134,6 +310,8 @@ namespace xgboost{ private: // number of omp thread used during training int nthread; + // tmp row pointer, per thread, used for tmp data construction + std::vector< std::vector > tmp_rptr; // Instance row indexes corresponding to each node std::vector row_index_set; // lower and upper bound of each nodes' row_index diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h index 31cb3d9f4..ebe2faa4d 100644 --- a/booster/xgboost_data.h +++ b/booster/xgboost_data.h @@ -86,6 +86,20 @@ namespace xgboost{ * \return row iterator */ inline RowIter GetRow( size_t ridx ) const; + /*! + * \brief get number of column groups, this ise used together with GetRow( ridx, gid ) + * \return number of column group + */ + inline unsigned NumColGroup( void ) const{ + return 1; + } + /*! + * \brief get row iterator, return iterator of specific column group + * \param ridx row index + * \param gid colmun group id + * \return row iterator, only iterates over features of specified column group + */ + inline RowIter GetRow( size_t ridx, unsigned gid ) const; /*! \return whether column access is enabled */ inline bool HaveColAccess( void ) const; @@ -140,6 +154,8 @@ namespace xgboost{ /*! \brief row iterator */ struct RowIter{ const REntry *dptr_, *end_; + RowIter( const REntry* dptr, const REntry* end ) + :dptr_(dptr),end_(end){} inline bool Next( void ){ if( dptr_ == end_ ) return false; else{ @@ -155,12 +171,16 @@ namespace xgboost{ }; /*! \brief column iterator */ struct ColIter: public RowIter{ + ColIter( const REntry* dptr, const REntry* end ) + :RowIter( dptr, end ){} inline bst_uint rindex( void ) const{ return this->findex(); } }; /*! \brief reverse column iterator */ struct ColBackIter: public ColIter{ + ColBackIter( const REntry* dptr, const REntry* end ) + :ColIter( dptr, end ){} // shadows RowIter::Next inline bool Next( void ){ if( dptr_ == end_ ) return false; @@ -223,10 +243,12 @@ namespace xgboost{ /*! \brief get row iterator*/ inline RowIter GetRow( size_t ridx ) const{ utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" ); - RowIter it; - it.dptr_ = &row_data_[ row_ptr_[ridx] ] - 1; - it.end_ = &row_data_[ row_ptr_[ridx+1] ] - 1; - return it; + return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 ); + } + /*! \brief get row iterator*/ + inline RowIter GetRow( size_t ridx, unsigned gid ) const{ + utils::Assert( gid == 0, "FMatrixS only have 1 column group" ); + return FMatrixS::GetRow( ridx ); } public: /*! \return whether column access is enabled */ @@ -241,18 +263,12 @@ namespace xgboost{ /*! \brief get col iterator*/ inline ColIter GetSortedCol( size_t cidx ) const{ utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); - ColIter it; - it.dptr_ = &col_data_[ col_ptr_[cidx] ] - 1; - it.end_ = &col_data_[ col_ptr_[cidx+1] ] - 1; - return it; + return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 ); } /*! \brief get col iterator */ inline ColBackIter GetReverseSortedCol( size_t cidx ) const{ utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); - ColBackIter it; - it.dptr_ = &col_data_[ col_ptr_[cidx+1] ]; - it.end_ = &col_data_[ col_ptr_[cidx] ]; - return it; + return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] ); } /*! * \brief intialize the data so that we have both column and row major