start add coltree maker

parent 10382f6365
commit fffad41e53

booster/tree/xgboost_col_treemaker.hpp (new file, 142 lines)
@@ -0,0 +1,142 @@
#ifndef _XGBOOST_COL_TREEMAKER_HPP_
#define _XGBOOST_COL_TREEMAKER_HPP_
/*!
 * \file xgboost_col_treemaker.hpp
 * \brief implementation of regression tree maker,
 *        use a column based approach, with OpenMP
 * \author Tianqi Chen: tianqi.tchen@gmail.com
 */
#include <vector>
#include <omp.h>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_random.h"

namespace xgboost{
    namespace booster{
        template<typename FMatrix>
        class ColTreeMaker{
        public:
            ColTreeMaker( RegTree &tree,
                          const TreeParamTrain &param,
                          const std::vector<float> &grad,
                          const std::vector<float> &hess,
                          const FMatrix &smat,
                          const std::vector<unsigned> &root_index ):
                tree( tree ), param( param ), grad( grad ), hess( hess ),
                smat( smat ), root_index( root_index ){
                utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
                utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
                utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
                utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
            }
            inline void Make( void ){
            }
        private:
            // statistics that are helpful to decide a split
            struct SplitEntry{
                /*! \brief gain in terms of loss */
                float loss_gain;
                /*! \brief weight calculated related to current data */
                float weight;
                /*! \brief split index */
                unsigned sindex;
                /*! \brief split value */
                float split_value;
                /*! \brief constructor */
                SplitEntry( void ){
                    weight = loss_gain = 0.0f;
                    split_value = 0.0f; sindex = 0;
                }
                inline void SetSplit( unsigned split_index, float split_value, bool default_left ){
                    if( default_left ) split_index |= (1U << 31);
                    this->sindex = split_index;
                    this->split_value = split_value;
                }
                inline unsigned split_index( void ) const{
                    return sindex & ( (1U<<31) - 1U );
                }
                inline bool default_left( void ) const{
                    return (sindex >> 31) != 0;
                }
            };
            /*! \brief per thread x per node entry to store tmp data */
            struct ThreadEntry{
                /*! \brief sum gradient statistics */
                double sum_grad;
                /*! \brief sum hessian statistics */
                double sum_hess;
                /*! \brief current best solution */
                SplitEntry best;
                ThreadEntry( void ){
                    sum_grad = sum_hess = 0;
                }
            };
        private:
            // find split at current level
            inline void FindSplit( int depth ){
                unsigned nsize = static_cast<unsigned>(feat_index.size());

                #pragma omp parallel for schedule( dynamic, 1 )
                for( unsigned i = 0; i < nsize; ++ i ){
                    const unsigned fid = feat_index[i];
                }
            }
            // initialize temp data structure
            inline void InitData( void ){
                position.resize( grad.size() );
                if( root_index.size() == 0 ){
                    std::fill( position.begin(), position.end(), 0 );
                }else{
                    for( size_t i = 0; i < root_index.size(); ++ i ){
                        position[i] = root_index[i];
                        utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
                    }
                }
                {// initialize feature index
                    for( int i = 0; i < tree.param.num_feature; i ++ ){
                        if( smat.GetSortedCol(i).Next() ){
                            feat_index.push_back( i );
                        }
                    }
                    random::Shuffle( feat_index );
                }
                {// setup temp space for each thread
                    int nthread;
                    #pragma omp parallel
                    {
                        nthread = omp_get_num_threads();
                    }
                    // reserve a small space
                    stemp.resize( nthread, std::vector<ThreadEntry>() );
                    for( size_t i = 0; i < stemp.size(); ++ i ){
                        stemp[i].reserve( 256 );
                        stemp[i].resize( tree.param.num_roots, ThreadEntry() );
                    }
                }
                {// setup statistics space for each tree node
                    snode.resize( tree.param.num_roots, SplitEntry() );
                }
            }
        private:
            // local helper tmp data structure
            // Per feature: shuffle index of each feature index
            std::vector<int> feat_index;
            // Instance Data: current node position in the tree of each instance
            std::vector<int> position;
            // TreeNode Data: statistics for each constructed node
            std::vector<SplitEntry> snode;
            // PerThread x PerTreeNode: statistics for per thread construction
            std::vector< std::vector<ThreadEntry> > stemp;
        private:
            // original data that supports tree construction
            RegTree &tree;
            const TreeParamTrain &param;
            const std::vector<float> &grad;
            const std::vector<float> &hess;
            const FMatrix &smat;
            const std::vector<unsigned> &root_index;
        };
    };
};
#endif
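Editor's note on the SplitEntry encoding above: the split feature index and the default direction for missing values are packed into one 32-bit field, with bit 31 holding the default direction and bits 0-30 the feature index. A minimal standalone sketch of the round-trip (not part of the commit):

#include <cassert>

int main( void ){
    unsigned split_index = 42;              // must fit in 31 bits
    bool default_left = true;
    // pack, as SetSplit does
    if( default_left ) split_index |= (1U << 31);
    unsigned sindex = split_index;
    // unpack, as split_index() and default_left() do
    assert( (sindex & ((1U << 31) - 1U)) == 42 );  // feature index recovered
    assert( (sindex >> 31) != 0 );                 // default direction recovered
    return 0;
}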
@@ -205,13 +205,13 @@ namespace xgboost{

             // enumerate split point of the tree
             inline void enumerate_split( RTSelecter &sglobal, int tlen,
-                                         double rsum_grad, double rsum_hess, double root_cost,
+                                         double rsum_grad, double rsum_hess, double root_gain,
                                          const SCEntry *entry, size_t start, size_t end,
                                          int findex, float parent_base_weight ){
                 // local selecter
                 RTSelecter slocal( param );

-                if( param.default_direction != 1 ){
+                if( param.need_forward_search() ){
                     // forward process, default right
                     double csum_grad = 0.0, csum_hess = 0.0;
                     for( size_t j = start; j < end; j ++ ){
@@ -225,8 +225,8 @@ namespace xgboost{
                         if( dsum_hess < param.min_child_weight ) break;
                         // change of loss
                         double loss_chg =
-                            param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                            param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                            param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                            param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;

                         const int clen = static_cast<int>( j + 1 - start );
                         // add candidate to selecter
@@ -237,7 +237,7 @@ namespace xgboost{
                     }
                 }

-                if( param.default_direction != 2 ){
+                if( param.need_backward_search() ){
                     // backward process, default left
                     double csum_grad = 0.0, csum_hess = 0.0;
                     for( size_t j = end; j > start; j -- ){
@@ -249,8 +249,8 @@ namespace xgboost{
                         if( csum_hess < param.min_child_weight ) continue;
                         const double dsum_hess = rsum_hess - csum_hess;
                         if( dsum_hess < param.min_child_weight ) break;
-                        double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) +
-                            param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost;
+                        double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
+                            param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
                         const int clen = static_cast<int>( end - j + 1 );
                         // add candidate to selecter
                         slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
@@ -319,8 +319,8 @@ namespace xgboost{

                 // global selecter
                 RTSelecter sglobal( param );
-                // cost root
-                const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess );
+                // gain root
+                const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
                 // KEY: layerwise, weight of current node if it is leaf
                 const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
                 // enumerate feature index
@@ -333,7 +333,7 @@ namespace xgboost{
                     std::sort( entry.begin() + start, entry.begin() + end );
                     // local selecter
                     this->enumerate_split( sglobal, tsk.len,
-                                           rsum_grad, rsum_hess, root_cost,
+                                           rsum_grad, rsum_hess, root_gain,
                                            &entry[0], start, end, findex, base_weight );
                 }
                 // Cleanup tmp_rptr for next use
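The enumerate_split changes above keep the two-pass pattern for missing values: a forward scan over the column sorted by feature value treats missing entries as defaulting right, while a backward scan treats them as defaulting left; the new need_forward_search()/need_backward_search() helpers just name the old default_direction checks. A simplified standalone sketch of that pattern (editor's code, assuming plain squared loss with no regularization; not the commit's code):

#include <cstdio>
#include <vector>
#include <algorithm>

struct Entry { double fvalue, grad, hess; };

// gain of a node given its gradient/hessian sums (squared loss, no reg)
static double CalcGain( double g, double h ){ return g * g / h; }

int main( void ){
    // one column, sorted by feature value; instances missing this
    // feature appear only in the root totals below
    std::vector<Entry> col = { {0.1,-1,1}, {0.5,2,1}, {0.9,-3,1} };
    double rsum_grad = -2.5, rsum_hess = 4;
    double root_gain = CalcGain( rsum_grad, rsum_hess );
    double best = 0;
    // forward pass: left child = scanned prefix, missing default right
    double cg = 0, ch = 0;
    for( size_t j = 0; j + 1 < col.size(); ++ j ){
        cg += col[j].grad; ch += col[j].hess;
        double chg = CalcGain( cg, ch )
                   + CalcGain( rsum_grad - cg, rsum_hess - ch ) - root_gain;
        best = std::max( best, chg );
    }
    // backward pass: right child = scanned suffix, missing default left
    cg = 0; ch = 0;
    for( size_t j = col.size(); j > 1; -- j ){
        cg += col[j-1].grad; ch += col[j-1].hess;
        double chg = CalcGain( cg, ch )
                   + CalcGain( rsum_grad - cg, rsum_hess - ch ) - root_gain;
        best = std::max( best, chg );
    }
    printf( "best loss change = %g\n", best );
    return 0;
}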
@@ -23,6 +23,7 @@ namespace xgboost{
    };

#include "xgboost_svdf_tree.hpp"
+#include "xgboost_col_treemaker.hpp"

namespace xgboost{
    namespace booster{
@@ -30,7 +31,9 @@ namespace xgboost{
        // see RegTreeUpdater
        class RegTreeTrainer : public IBooster{
        public:
-            RegTreeTrainer( void ){ silent = 0; }
+            RegTreeTrainer( void ){
+                silent = 0; tree_maker = 0;
+            }
            virtual ~RegTreeTrainer( void ){}
        public:
            virtual void SetParam( const char *name, const char *val ){
@@ -51,8 +54,8 @@ namespace xgboost{
            virtual void DoBoost( std::vector<float> &grad,
                                  std::vector<float> &hess,
                                  const FMatrixS &smat,
-                                 const std::vector<unsigned> &group_id ){
-                this->DoBoost_( grad, hess, smat, group_id );
+                                 const std::vector<unsigned> &root_index ){
+                this->DoBoost_( grad, hess, smat, root_index );
            }

            virtual int GetLeafIndex( const std::vector<float> &feat,
@@ -108,23 +111,28 @@ namespace xgboost{
            inline void DoBoost_( std::vector<float> &grad,
                                  std::vector<float> &hess,
                                  const FMatrix &smat,
-                                 const std::vector<unsigned> &group_id ){
+                                 const std::vector<unsigned> &root_index ){
                utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
                if( !silent ){
                    printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
                }
-                // start with a id set
-                RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, group_id );
-                int num_pruned;
-                tree.param.max_depth = updater.do_boost( num_pruned );
-
-                if( !silent ){
-                    printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
-                            tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                if( tree_maker == 0 ){
+                    // start with a id set
+                    RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
+                    int num_pruned;
+                    tree.param.max_depth = updater.do_boost( num_pruned );
+                    if( !silent ){
+                        printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
+                                tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
+                    }
+                }else{
+                    ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index );
+                    maker.Make();
                }
            }
        private:
            int silent;
+            int tree_maker;
            RegTree tree;
            TreeParamTrain param;
        private:
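The trainer now dispatches on the new tree_maker field: 0 keeps the existing row-based RTreeUpdater path unchanged, and any other value routes to the new ColTreeMaker, whose Make() is still a stub in this commit. The diff does not show how tree_maker gets set; presumably SetParam parses it the same way as silent. A hypothetical standalone sketch of that handling (editor's assumption, not shown in the commit):

#include <cstring>
#include <cstdlib>

struct TrainerSketch {
    int silent, tree_maker;
    TrainerSketch( void ){ silent = 0; tree_maker = 0; }
    inline void SetParam( const char *name, const char *val ){
        // hypothetical parameter names, mirroring the ctor defaults above
        if( !strcmp( name, "silent" ) )     silent = atoi( val );
        if( !strcmp( name, "tree_maker" ) ) tree_maker = atoi( val );
    }
};

int main( void ){
    TrainerSketch t;
    t.SetParam( "tree_maker", "1" );  // would route DoBoost_ to ColTreeMaker
    return t.tree_maker == 1 ? 0 : 1;
}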
@@ -391,7 +391,7 @@ namespace xgboost{
            }
        public:
            // calculate the cost of loss function
-            inline double CalcCost( double sum_grad, double sum_hess ) const{
+            inline double CalcGain( double sum_grad, double sum_hess ) const{
                if( sum_hess < min_child_weight ){
                    return 0.0;
                }
@@ -405,29 +405,37 @@ namespace xgboost{
            }
            // KEY:layerwise
            // calculate cost of root
-            inline double CalcRootCost( double sum_grad, double sum_hess ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
+            inline double CalcRootGain( double sum_grad, double sum_hess ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
                else return 0.0;
            }
            // KEY:layerwise
            // calculate the cost after split
            // base_weight: the base_weight of parent
-            inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{
-                if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess );
-                else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess );
+            inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
+                if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
+                else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
            }
            // calculate the weight of leaf
            inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
                if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
                else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
            }
+            /*! \brief whether need forward small to big search: default right */
+            inline bool need_forward_search( void ) const{
+                return this->default_direction != 1;
+            }
+            /*! \brief whether need backward big to small search: default left */
+            inline bool need_backward_search( void ) const{
+                return this->default_direction != 2;
+            }
            /*! \brief given the loss change, whether we need to invoke pruning */
            inline bool need_prune( double loss_chg, int depth ) const{
-                return loss_chg < min_split_loss;
+                return loss_chg < this->min_split_loss;
            }
            /*! \brief whether we can split with current hessian */
            inline bool cannot_split( double sum_hess, int depth ) const{
-                return sum_hess < min_child_weight * 2.0;
+                return sum_hess < this->min_child_weight * 2.0;
            }
        };
    };
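The rename from CalcCost to CalcGain is cosmetic, but the layerwise overloads deserve a note: they score a child relative to the weight its parent already carries. CalcGain's body is not shown in this diff; assuming the usual second-order objective with an L2 penalty lambda on the per-node delta, the shift of sum_grad by sum_hess * base_weight falls out directly (editor's derivation):

% node with gradient sum G and hessian sum H, whose parent already
% predicts weight w_p; d is this node's delta on top of w_p
\mathrm{obj}(d) = (G + H w_p)\, d + \tfrac{1}{2}\,(H + \lambda)\, d^2
\qquad\Rightarrow\qquad
d^* = -\frac{G + H w_p}{H + \lambda}, \quad
\mathrm{gain} = \frac{(G + H w_p)^2}{2\,(H + \lambda)}

That is, the non-layerwise formulas with G replaced by G + H w_p: exactly the shift applied by CalcGain( sum_grad + sum_hess * base_weight, sum_hess ), while CalcWeight's recursion w = w_p + d* returns the child's total weight.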
@@ -66,6 +66,8 @@ namespace xgboost{
            /*! \return feature value in current position */
            inline bst_float fvalue( void ) const;
        };
+        /*! \brief backward iterator over column */
+        struct ColBackIter : public ColIter {};
    public:
        /*!
         * \brief get number of rows
@@ -83,17 +85,21 @@ namespace xgboost{
         * \return row iterator
         */
        inline RowIter GetRow( size_t ridx ) const;
-
+        /*! \return whether column access is enabled */
+        inline bool HaveColAccess( void ) const;
        /*!
         * \brief get column iterator, the columns must be sorted by feature value
         * \param ridx column index
         * \return column iterator
         */
        inline ColIter GetSortedCol( size_t ridx ) const;

-        /*! \return the view of derived class */
-        inline const Derived& self( void ) const{
-            return *static_cast<const Derived*>(this);
-        }
+        /*!
+         * \brief get column backward iterator, starts from biggest fvalue, and iterate back
+         * \param ridx column index
+         * \return reverse column iterator
+         */
+        inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
    };
};
};
@@ -152,6 +158,16 @@ namespace xgboost{
                return this->findex();
            }
        };
+        /*! \brief reverse column iterator */
+        struct ColBackIter: public ColIter{
+            // shadows RowIter::Next
+            inline bool Next( void ){
+                if( dptr_ == end_ ) return false;
+                else{
+                    -- dptr_; return true;
+                }
+            }
+        };
    public:
        /*! \brief constructor */
        FMatrixS( void ){ this->Clear(); }
@@ -229,6 +245,14 @@ namespace xgboost{
            it.end_ = &col_data_[ col_ptr_[cidx+1] ] - 1;
            return it;
        }
+        /*! \brief get reverse col iterator */
+        inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
+            utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
+            ColBackIter it;
+            it.dptr_ = &col_data_[ col_ptr_[cidx+1] ];
+            it.end_ = &col_data_[ col_ptr_[cidx] ];
+            return it;
+        }
        /*!
         * \brief intialize the data so that we have both column and row major
         * access, call this whenever we need column access
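On the new reverse iterator: GetReverseSortedCol starts dptr_ one past the largest-fvalue entry and Next() pre-decrements, with end_ pointing at the first entry, so forward and backward iteration share the same while( it.Next() ) loop shape. A standalone sketch of that contract (editor's code, not the commit's):

#include <cstdio>

struct BackIter {
    const float *dptr_, *end_;   // cursor one past the end, sentinel at begin
    bool Next( void ){
        if( dptr_ == end_ ) return false;
        -- dptr_; return true;   // pre-decrement, mirroring ColBackIter::Next
    }
    float fvalue( void ) const { return *dptr_; }
};

int main( void ){
    static const float col[3] = { 0.1f, 0.5f, 0.9f };  // sorted by fvalue
    BackIter it = { col + 3, col };  // dptr_ = one past end, end_ = begin
    while( it.Next() ){              // visits 0.9, 0.5, 0.1
        printf( "%g\n", it.fvalue() );
    }
    return 0;
}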