diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp new file mode 100644 index 000000000..fd42402c0 --- /dev/null +++ b/booster/tree/xgboost_svdf_tree.hpp @@ -0,0 +1,556 @@ +#ifndef _XGBOOST_APEX_TREE_HPP_ +#define _XGBOOST_APEX_TREE_HPP_ +/*! + * \file xgboost_svdf_tree.hpp + * \brief implementation of regression tree, with layerwise support + * this file is adapted from GBRT implementation in SVDFeature project + * \author Tianqi Chen: tqchen@apex.sjtu.edu.cn, tianqi.tchen@gmail.com + */ +#include +#include "xgboost_tree_model.h" +#include "../../utils/xgboost_random.h" +#include "../../utils/xgboost_matrix_csr.h" + +namespace xgboost{ + namespace booster{ + const bool rt_debug = false; + // whether to check bugs + const bool check_bug = false; + + const float rt_eps = 1e-5f; + const float rt_2eps = rt_eps * 2.0f; + + inline double sqr( double a ){ + return a * a; + } + + inline void assert_sorted( unsigned *idset, int len ){ + if( !rt_debug || !check_bug ) return; + for( int i = 1; i < len; i ++ ){ + utils::Assert( idset[i-1] < idset[i], "idset not sorted" ); + } + } + }; + + namespace booster{ + // node stat used in rtree + struct RTreeNodeStat{ + // loss chg caused by current split + float loss_chg; + // weight of current node + float base_weight; + // number of child that is leaf node known up to now + int leaf_child_cnt; + }; + + // structure of Regression Tree + class RTree: public TreeModel{ + }; + + // selecter of rtree to find the suitable candidate + class RTSelecter{ + public: + struct Entry{ + float loss_chg; + size_t start; + int len; + unsigned sindex; + float split_value; + Entry(){} + Entry( float loss_chg, size_t start, int len, unsigned split_index, float split_value, bool default_left ){ + this->loss_chg = loss_chg; + this->start = start; + this->len = len; + if( default_left ) split_index |= (1U << 31); + this->sindex = split_index; + this->split_value = split_value; + } + inline unsigned split_index( void ) const{ + return sindex & ( (1U<<31) - 1U ); + } + inline bool default_left( void ) const{ + return (sindex >> 31) != 0; + } + }; + private: + Entry best_entry; + const TreeParamTrain ¶m; + public: + RTSelecter( const TreeParamTrain &p ):param( p ){ + memset( &best_entry, 0, sizeof(best_entry) ); + best_entry.loss_chg = 0.0f; + } + inline void push_back( const Entry &e ){ + if( e.loss_chg > best_entry.loss_chg ) best_entry = e; + } + inline const Entry & select( void ){ + return best_entry; + } + }; + + // updater of rtree, allows the parameters to be stored inside, key solver + class RTreeUpdater{ + protected: + // training task, element of single task + struct Task{ + // node id in tree + int nid; + // idset pointer, instance id in [idset,idset+len) + unsigned *idset; + // length of idset + unsigned len; + // base_weight of parent + float parent_base_weight; + Task(){} + Task( int nid, unsigned *idset, unsigned len, float pweight = 0.0f ){ + this->nid = nid; + this->idset = idset; + this->len = len; + this->parent_base_weight = pweight; + } + }; + + // sparse column entry + struct SCEntry{ + // feature value + float fvalue; + // row index in grad + unsigned rindex; + SCEntry(){} + SCEntry( float fvalue, unsigned rindex ){ + this->fvalue = fvalue; this->rindex = rindex; + } + inline bool operator<( const SCEntry &p ) const{ + return fvalue < p.fvalue; + } + }; + private: + // training parameter + const TreeParamTrain ¶m; + // parameters, reference + RTree &tree; + std::vector &grad; + std::vector &hess; + const FMatrixS::Image 
&smat; + const std::vector &group_id; + private: + // maximum depth up to now + int max_depth; + // number of nodes being pruned + int num_pruned; + // stack to store current task + std::vector task_stack; + // temporal space for index set + std::vector idset; + private: + // task management: NOTE DFS here + inline void add_task( Task tsk ){ + task_stack.push_back( tsk ); + } + inline bool next_task( Task &tsk ){ + if( task_stack.size() == 0 ) return false; + tsk = task_stack.back(); + task_stack.pop_back(); + return true; + } + private: + // try to prune off current leaf, return true if successful + inline void try_prune_leaf( int nid, int depth ){ + if( tree[ nid ].is_root() ) return; + int pid = tree[ nid ].parent(); + RTree::NodeStat &s = tree.stat( pid ); + s.leaf_child_cnt ++; + + if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){ + // need to be pruned + tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight ); + // add statistics to number of nodes pruned + num_pruned += 2; + // tail recursion + this->try_prune_leaf( pid, depth - 1 ); + } + } + // make leaf for current node :) + inline void make_leaf( Task tsk, double sum_grad, double sum_hess, bool compute ){ + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + if( compute ){ + sum_grad += grad[ ridx ]; + sum_hess += hess[ ridx ]; + } + } + tree[ tsk.nid ].set_leaf( param.learning_rate * param.CalcWeight( sum_grad, sum_hess, tsk.parent_base_weight ) ); + this->try_prune_leaf( tsk.nid, tree.GetDepth( tsk.nid ) ); + } + private: + // make split for current task, re-arrange positions in idset + inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double base_weight ){ + // before split, first prepare statistics + RTree::NodeStat &s = tree.stat( tsk.nid ); + s.loss_chg = loss_chg; + s.leaf_child_cnt = 0; + s.base_weight = static_cast( base_weight ); + + // add childs to current node + tree.AddChilds( tsk.nid ); + // assert that idset is sorted + assert_sorted( tsk.idset, tsk.len ); + // use merge sort style to get the solution + std::vector qset; + for( int i = 0; i < num; i ++ ){ + qset.push_back( entry[i].rindex ); + } + std::sort( qset.begin(), qset.end() ); + // do merge sort style, make the other set, remove elements in qset + for( unsigned i = 0, top = 0; i < tsk.len; i ++ ){ + if( top < qset.size() ){ + if( tsk.idset[ i ] != qset[ top ] ){ + tsk.idset[ i - top ] = tsk.idset[ i ]; + }else{ + top ++; + } + }else{ + tsk.idset[ i - qset.size() ] = tsk.idset[ i ]; + } + } + // get two parts + RTree::Node &n = tree[ tsk.nid ]; + Task def_part( n.default_left() ? n.cleft() : n.cright(), tsk.idset, tsk.len - qset.size(), s.base_weight ); + Task spl_part( n.default_left() ? 
n.cright(): n.cleft() , tsk.idset + def_part.len, qset.size(), s.base_weight ); + // fill back split part + for( unsigned i = 0; i < spl_part.len; i ++ ){ + spl_part.idset[ i ] = qset[ i ]; + } + // add tasks to the queue + this->add_task( def_part ); + this->add_task( spl_part ); + } + + // enumerate split point of the tree + inline void enumerate_split( RTSelecter &sglobal, int tlen, + double rsum_grad, double rsum_hess, double root_cost, + const SCEntry *entry, size_t start, size_t end, + int findex, float parent_base_weight ){ + // local selecter + RTSelecter slocal( param ); + + if( param.default_direction != 1 ){ + // forward process, default right + double csum_grad = 0.0, csum_hess = 0.0; + for( size_t j = start; j < end; j ++ ){ + const unsigned ridx = entry[ j ].rindex; + csum_grad += grad[ ridx ]; + csum_hess += hess[ ridx ]; + // check for split + if( j == end - 1 || entry[j].fvalue + rt_2eps < entry[ j + 1 ].fvalue ){ + if( csum_hess < param.min_child_weight ) continue; + const double dsum_hess = rsum_hess - csum_hess; + if( dsum_hess < param.min_child_weight ) break; + // change of loss + double loss_chg = + param.CalcCost( csum_grad, csum_hess, parent_base_weight ) + + param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost; + + const int clen = static_cast( j + 1 - start ); + // add candidate to selecter + slocal.push_back( RTSelecter::Entry( loss_chg, start, clen, findex, + j == end - 1 ? entry[j].fvalue + rt_eps : 0.5 * (entry[j].fvalue+entry[j+1].fvalue), + false ) ); + } + } + } + + if( param.default_direction != 2 ){ + // backward process, default left + double csum_grad = 0.0, csum_hess = 0.0; + for( size_t j = end; j > start; j -- ){ + const unsigned ridx = entry[ j - 1 ].rindex; + csum_grad += grad[ ridx ]; + csum_hess += hess[ ridx ]; + // check for split + if( j == start + 1 || entry[ j - 2 ].fvalue + rt_2eps < entry[ j - 1 ].fvalue ){ + if( csum_hess < param.min_child_weight ) continue; + const double dsum_hess = rsum_hess - csum_hess; + if( dsum_hess < param.min_child_weight ) break; + double loss_chg = param.CalcCost( csum_grad, csum_hess, parent_base_weight ) + + param.CalcCost( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_cost; + const int clen = static_cast( end - j + 1 ); + // add candidate to selecter + slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex, + j == start + 1 ? 
entry[j-1].fvalue - rt_eps : 0.5 * (entry[j-2].fvalue + entry[j-1].fvalue), + true ) ); + } + } + } + sglobal.push_back( slocal.select() ); + } + + private: + // temporal storage for expand column major + std::vector tmp_rptr; + // find split for current task, another implementation of expand in column major manner + // should be more memory frugal, avoid global sorting across feature + inline void expand( Task tsk ){ + // assert that idset is sorted + // if reach maximum depth, make leaf from current node + int depth = tree.GetDepth( tsk.nid ); + // update statistiss + if( depth > max_depth ) max_depth = depth; + // if bigger than max depth + if( depth >= param.max_depth ){ + this->make_leaf( tsk, 0.0, 0.0, true ); return; + } + // convert to column major CSR format + const int nrows = tree.param.num_feature; + if( tmp_rptr.size() == 0 ){ + // initialize tmp storage in first usage + tmp_rptr.resize( nrows + 1 ); + std::fill( tmp_rptr.begin(), tmp_rptr.end(), 0 ); + } + // records the columns + std::vector entry; + // records the active features + std::vector aclist; + utils::SparseCSRMBuilder builder( tmp_rptr, entry, aclist ); + builder.InitBudget( nrows ); + // statistics of root + double rsum_grad = 0.0, rsum_hess = 0.0; + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + rsum_grad += grad[ ridx ]; + rsum_hess += hess[ ridx ]; + + FMatrixS::Line sp = smat[ ridx ]; + for( unsigned j = 0; j < sp.len; j ++ ){ + builder.AddBudget( sp.findex[j] ); + } + } + + // if minimum split weight is not meet + if( param.cannot_split( rsum_hess, depth ) ){ + this->make_leaf( tsk, rsum_grad, rsum_hess, false ); builder.Cleanup(); return; + } + + builder.InitStorage(); + for( unsigned i = 0; i < tsk.len; i ++ ){ + const unsigned ridx = tsk.idset[i]; + FMatrixS::Line sp = smat[ ridx ]; + for( unsigned j = 0; j < sp.len; j ++ ){ + builder.PushElem( sp.findex[j], SCEntry( sp.fvalue[j], ridx ) ); + } + } + // --- end of building column major matrix --- + // after this point, tmp_rptr and entry is ready to use + + // global selecter + RTSelecter sglobal( param ); + // cost root + const double root_cost = param.CalcRootCost( rsum_grad, rsum_hess ); + // KEY: layerwise, weight of current node if it is leaf + const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight ); + // enumerate feature index + for( size_t i = 0; i < aclist.size(); i ++ ){ + int findex = static_cast( aclist[i] ); + size_t start = tmp_rptr[ findex ]; + size_t end = tmp_rptr[ findex + 1 ]; + utils::Assert( start < end, "bug" ); + // local sort can be faster when the features are sparse + std::sort( entry.begin() + start, entry.begin() + end ); + // local selecter + this->enumerate_split( sglobal, tsk.len, + rsum_grad, rsum_hess, root_cost, + &entry[0], start, end, findex, base_weight ); + } + // Cleanup tmp_rptr for next use + builder.Cleanup(); + // get the best solution + const RTSelecter::Entry &e = sglobal.select(); + // allowed to split + if( e.loss_chg > rt_eps ){ + // add splits + tree[ tsk.nid ].set_split( e.split_index(), e.split_value, e.default_left() ); + // re-arrange idset, push tasks + this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, base_weight ); + }else{ + // make leaf if we didn't meet requirement + this->make_leaf( tsk, rsum_grad, rsum_hess, false ); + } + } + private: + // initialize the tasks + inline void init_tasks( size_t ngrads ){ + // add group partition if necessary + if( group_id.size() == 0 ){ + if( param.subsample > 1.0f - 1e-6f ){ + 
idset.resize( 0 ); + for( size_t i = 0; i < ngrads; i ++ ){ + if( hess[i] < 0.0f ) continue; + idset.push_back( (unsigned)i ); + } + }else{ + idset.resize( 0 ); + for( size_t i = 0; i < ngrads; i ++ ){ + if( random::SampleBinary( param.subsample ) != 0 ){ + idset.push_back( (unsigned)i ); + } + } + } + this->add_task( Task( 0, &idset[0], idset.size() ) ); return; + } + + utils::Assert( group_id.size() == ngrads, "number of groups must be exact" ); + {// new method for grouping, use CSR builder + std::vector rptr; + utils::SparseCSRMBuilder builder( rptr, idset ); + builder.InitBudget( tree.param.num_roots ); + for( size_t i = 0; i < group_id.size(); i ++ ){ + // drop invalid elements + if( hess[ i ] < 0.0f ) continue; + utils::Assert( group_id[ i ] < (unsigned)tree.param.num_roots, + "group id exceed number of roots" ); + builder.AddBudget( group_id[ i ] ); + } + builder.InitStorage(); + for( size_t i = 0; i < group_id.size(); i ++ ){ + // drop invalid elements + if( hess[ i ] < 0.0f ) continue; + builder.PushElem( group_id[ i ], static_cast(i) ); + } + for( size_t i = 1; i < rptr.size(); i ++ ){ + const size_t start = rptr[ i - 1 ], end = rptr[ i ]; + if( start < end ){ + this->add_task( Task( i - 1, &idset[ start ], end - start ) ); + } + } + } + } + public: + RTreeUpdater( const TreeParamTrain &pparam, + RTree &ptree, + std::vector &pgrad, + std::vector &phess, + const FMatrixS::Image &psmat, + const std::vector &pgroup_id ): + param( pparam ), tree( ptree ), grad( pgrad ), hess( phess ), + smat( psmat ), group_id( pgroup_id ){ + } + inline int do_boost( int &num_pruned ){ + this->init_tasks( grad.size() ); + this->max_depth = 0; + this->num_pruned = 0; + Task tsk; + while( this->next_task( tsk ) ){ + this->expand( tsk ); + } + num_pruned = this->num_pruned; + return max_depth; + } + }; + + + + class RTreeTrainer : public IBooster{ + private: + int silent; + // tree of current shape + RTree tree; + TreeParamTrain param; + private: + std::vector tmp_feat; + std::vector tmp_funknown; + inline void init_tmpfeat( void ){ + if( tmp_feat.size() != (size_t)tree.param.num_feature ){ + tmp_feat.resize( tree.param.num_feature ); + tmp_funknown.resize( tree.param.num_feature ); + std::fill( tmp_funknown.begin(), tmp_funknown.end(), true ); + } + } + public: + virtual void SetParam( const char *name, const char *val ){ + if( !strcmp( name, "silent") ) silent = atoi( val ); + param.SetParam( name, val ); + tree.param.SetParam( name, val ); + } + virtual void LoadModel( utils::IStream &fi ){ + tree.LoadModel( fi ); + } + virtual void SaveModel( utils::IStream &fo ) const{ + tree.SaveModel( fo ); + } + virtual void InitModel( void ){ + tree.InitModel(); + } + private: + inline int get_next( int pid, float fvalue, bool is_unknown ){ + float split_value = tree[ pid ].split_cond(); + if( is_unknown ){ + if( tree[ pid ].default_left() ) return tree[ pid ].cleft(); + else return tree[ pid ].cright(); + }else{ + if( fvalue < split_value ) return tree[ pid ].cleft(); + else return tree[ pid ].cright(); + } + } + public: + virtual void DoBoost( std::vector &grad, + std::vector &hess, + const FMatrixS::Image &smat, + const std::vector &group_id ){ + utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" ); + if( !silent ){ + printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() ); + } + // start with a id set + RTreeUpdater updater( param, tree, grad, hess, smat, group_id ); + int num_pruned; + tree.param.max_depth = updater.do_boost( num_pruned ); + + if( !silent ){ 
+ printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", + tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth ); + } + } + + virtual int GetLeafIndex( const std::vector &feat, + const std::vector &funknown, + unsigned gid = 0 ){ + // start from groups that belongs to current data + int pid = (int)gid; + // tranverse tree + while( !tree[ pid ].is_leaf() ){ + unsigned split_index = tree[ pid ].split_index(); + pid = this->get_next( pid, feat[ split_index ], funknown[ split_index ] ); + } + return pid; + } + virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){ + this->init_tmpfeat(); + for( unsigned i = 0; i < feat.len; i ++ ){ + utils::Assert( feat.findex[i] < (unsigned)tmp_funknown.size() , "input feature execeed bound" ); + tmp_funknown[ feat.findex[i] ] = false; + tmp_feat[ feat.findex[i] ] = feat.fvalue[i]; + } + int pid = this->GetLeafIndex( tmp_feat, tmp_funknown, gid ); + // set back + for( unsigned i = 0; i < feat.len; i ++ ){ + tmp_funknown[ feat.findex[i] ] = true; + } + return tree[ pid ].leaf_value(); + } + virtual float Predict( const std::vector &feat, + const std::vector &funknown, + unsigned gid = 0 ){ + utils::Assert( feat.size() >= (size_t)tree.param.num_feature, + "input data smaller than num feature" ); + int pid = this->GetLeafIndex( feat, funknown, gid ); + return tree[ pid ].leaf_value(); + } + public: + RTreeTrainer( void ){ silent = 0; } + virtual ~RTreeTrainer( void ){} + }; + }; +}; +#endif + + diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h new file mode 100644 index 000000000..fb9f4fdf5 --- /dev/null +++ b/booster/tree/xgboost_tree_model.h @@ -0,0 +1,415 @@ +#ifndef _XGBOOST_TREE_MODEL_H_ +#define _XGBOOST_TREE_MODEL_H_ +/*! + * \file xgboost_tree_model.h + * \brief generic definition of model structure used in tree models + * used to support learning of boosting tree + * \author Tianqi Chen: tianqi.tchen@gmail.com + */ +#include +#include "../../utils/xgboost_utils.h" +#include "../../utils/xgboost_stream.h" + +namespace xgboost{ + namespace booster{ + /*! + * \brief template class of TreeModel + * \tparam TSplitCond data type to indicate split condition + * \tparam TNodeStat auxiliary statistics of node to help tree building + */ + template + class TreeModel{ + public: + /*! \brief data type to indicate split condition */ + typedef TNodeStat NodeStat; + /*! \brief auxiliary statistics of node to help tree building */ + typedef TSplitCond SplitCond; + public: + /*! \brief parameters of the tree */ + struct Param{ + /*! \brief number of start root */ + int num_roots; + /*! \brief total number of nodes */ + int num_nodes; + /*!\brief number of deleted nodes */ + int num_deleted; + /*! \brief maximum depth, this is a statistics of the tree */ + int max_depth; + /*! \brief number of features used for tree construction */ + int num_feature; + /*! \brief reserved part */ + int reserved[ 32 ]; + /*! \brief constructor */ + Param( void ){ + max_depth = 0; + memset( reserved, 0, sizeof( reserved ) ); + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + inline void SetParam( const char *name, const char *val ){ + if( !strcmp("num_roots", name ) ) num_roots = atoi( val ); + if( !strcmp("num_feature", name ) ) num_feature = atoi( val ); + } + }; + /*! \brief tree node */ + class Node{ + private: + friend class TreeModel; + /*! 
+ * \brief in leaf node, we have weights, in non-leaf nodes, + * we have split condition + */ + union Info{ + float leaf_value; + TSplitCond split_cond; + }; + private: + // pointer to parent, highest bit is used to indicate whether it's a left child or not + int sparent; + // pointer to left, right + int left, right; + // split feature index, left split or right split depends on the highest bit + unsigned sindex; + // extra info + Info info; + private: + inline void set_parent( int pidx, bool is_left_child = true ){ + if( is_left_child ) pidx |= (1U << 31); + this->sparent = pidx; + } + public: + /*! \brief index of left child */ + inline int cleft( void ) const{ + return this->left; + } + /*! \brief index of right child */ + inline int cright( void ) const{ + return this->right; + } + /*! \brief feature index of split condition */ + inline unsigned split_index( void ) const{ + return sindex & ( (1U<<31) - 1U ); + } + /*! \brief when feature is unknown, whether goes to left child */ + inline bool default_left( void ) const{ + return (sindex >> 31) != 0; + } + /*! \brief whether current node is leaf node */ + inline bool is_leaf( void ) const{ + return left == -1; + } + /*! \brief get leaf value of leaf node */ + inline float leaf_value( void ) const{ + return (this->info).leaf_value; + } + /*! \brief get split condition of the node */ + inline TSplitCond split_cond( void ) const{ + return (this->info).split_cond; + } + /*! \brief get parent of the node */ + inline int parent( void ) const{ + return sparent & ( (1U << 31) - 1 ); + } + /*! \brief whether current node is left child */ + inline bool is_left_child( void ) const{ + return ( sparent & (1U << 31)) != 0; + } + /*! \brief whether current node is root */ + inline bool is_root( void ) const{ + return sparent == -1; + } + /*! + * \brief set the right child + * \param nide node id to right child + */ + inline void set_right_child( int nid ){ + this->right = nid; + } + /*! + * \brief set split condition of current node + * \param split_index feature index to split + * \param split_cond split condition + * \param default_left the default direction when feature is unknown + */ + inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){ + if( default_left ) split_index |= (1U << 31); + this->sindex = split_index; + (this->info).split_cond = split_cond; + } + /*! + * \brief set the leaf value of the node + * \param value leaf value + * \param right right index, could be used to store + * additional information + */ + inline void set_leaf( float value, int right = -1 ){ + (this->info).leaf_value = value; + this->left = -1; + this->right = right; + } + }; + protected: + // vector of nodes + std::vector nodes; + // stats of nodes + std::vector stats; + protected: + // free node space, used during training process + std::vector deleted_nodes; + // allocate a new node, + // !!!!!! NOTE: may cause BUG here, nodes.resize + inline int AllocNode( void ){ + if( param.num_deleted != 0 ){ + int nd = deleted_nodes.back(); + deleted_nodes.pop_back(); + param.num_deleted --; + return nd; + } + int nd = param.num_nodes ++; + nodes.resize( param.num_nodes ); + stats.resize( param.num_nodes ); + return nd; + } + // delete a tree node + inline void DeleteNode( int nid ){ + utils::Assert( nid >= param.num_roots, "can not delete root"); + deleted_nodes.push_back( nid ); + nodes[ nid ].set_parent( -1 ); + param.num_deleted ++; + } + public: + /*! 
+ * \brief change a non leaf node to a leaf node, delete its children + * \param rid node id of the node + * \param new leaf value + */ + inline void ChangeToLeaf( int rid, float value ){ + utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child"); + utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child"); + this->DeleteNode( nodes[ rid ].left ); + this->DeleteNode( nodes[ rid ].right ); + nodes[ rid ].set_leaf( value ); + } + public: + /*! \brief model parameter */ + Param param; + public: + /*! \brief constructor */ + TreeModel( void ){ + param.num_nodes = 1; + param.num_roots = 1; + param.num_deleted = 0; + nodes.resize( 1 ); + } + /*! \brief get node given nid */ + inline Node &operator[]( int nid ){ + return nodes[ nid ]; + } + /*! \brief get node statistics given nid */ + inline NodeStat &stat( int nid ){ + return stats[ nid ]; + } + /*! \brief initialize the model */ + inline void InitModel( void ){ + param.num_nodes = param.num_roots; + nodes.resize( param.num_nodes ); + stats.resize( param.num_nodes ); + for( int i = 0; i < param.num_nodes; i ++ ){ + nodes[i].set_leaf( 0.0f ); + nodes[i].set_parent( -1 ); + } + } + /*! + * \brief load model from stream + * \param fi input stream + */ + inline void LoadModel( utils::IStream &fi ){ + utils::Assert( fi.Read( ¶m, sizeof(Param) ) > 0, "TreeModel" ); + nodes.resize( param.num_nodes ); + utils::Assert( fi.Read( &nodes[0], sizeof(Node) * nodes.size() ) > 0, "TreeModel::Node" ); + + deleted_nodes.resize( 0 ); + for( int i = param.num_roots; i < param.num_nodes; i ++ ){ + if( nodes[i].is_root() ) deleted_nodes.push_back( i ); + } + utils::Assert( (int)deleted_nodes.size() == param.num_deleted, "number of deleted nodes do not match" ); + } + /*! + * \brief save model to stream + * \param fo output stream + */ + inline void SaveModel( utils::IStream &fo ) const{ + utils::Assert( param.num_nodes == (int)nodes.size() ); + fo.Write( ¶m, sizeof(Param) ); + fo.Write( &nodes[0], sizeof(Node) * nodes.size() ); + } + /*! + * \brief add child nodes to node + * \param nid node id to add childs + */ + inline void AddChilds( int nid ){ + int pleft = this->AllocNode(); + int pright = this->AllocNode(); + nodes[ nid ].left = pleft; + nodes[ nid ].right = pright; + nodes[ nodes[ nid ].left ].set_parent( nid, true ); + nodes[ nodes[ nid ].right ].set_parent( nid, false ); + } + /*! + * \brief only add a right child to a leaf node + * \param node id to add right child + */ + inline void AddRightChild( int nid ){ + int pright = this->AllocNode(); + nodes[ nid ].right = pright; + nodes[ nodes[ nid ].right ].set_parent( nid, false ); + } + /*! + * \brief get current depth + * \param nid node id + * \param pass_rchild whether right child is not counted in depth + */ + inline int GetDepth( int nid, bool pass_rchild = false ) const{ + int depth = 0; + while( !nodes[ nid ].is_root() ){ + if( !pass_rchild || nodes[ nid ].is_left_child() ) depth ++; + nid = nodes[ nid ].parent(); + } + return depth; + } + /*! \brief number of extra nodes besides the root */ + inline int num_extra_nodes( void ) const { + return param.num_nodes - param.num_roots - param.num_deleted; + } + }; + }; + + namespace booster{ + /*! 
\brief training parameters for regression tree */ + struct TreeParamTrain{ + // learning step size for a time + float learning_rate; + // minimum loss change required for a split + float min_split_loss; + // maximum depth of a tree + int max_depth; + //----- the rest parameters are less important ---- + // minimum amount of hessian(weight) allowed in a child + float min_child_weight; + // weight decay parameter used to control leaf fitting + float reg_lambda; + // reg method + int reg_method; + // default direction choice + int default_direction; + // whether we want to do subsample + float subsample; + // whether to use layerwise aware regularization + int use_layerwise; + /*! \brief constructor */ + TreeParamTrain( void ){ + learning_rate = 0.3f; + min_child_weight = 1.0f; + max_depth = 6; + reg_lambda = 1.0f; + reg_method = 2; + default_direction = 0; + subsample = 1.0f; + use_layerwise = 0; + } + /*! + * \brief set parameters from outside + * \param name name of the parameter + * \param val value of the parameter + */ + inline void SetParam( const char *name, const char *val ){ + // sync-names + if( !strcmp( name, "gamma") ) min_split_loss = (float)atof( val ); + if( !strcmp( name, "eta") ) learning_rate = (float)atof( val ); + if( !strcmp( name, "lambda") ) reg_lambda = (float)atof( val ); + // normal tree prameters + if( !strcmp( name, "learning_rate") ) learning_rate = (float)atof( val ); + if( !strcmp( name, "min_child_weight") ) min_child_weight = (float)atof( val ); + if( !strcmp( name, "min_split_loss") ) min_split_loss = (float)atof( val ); + if( !strcmp( name, "max_depth") ) max_depth = atoi( val ); + if( !strcmp( name, "reg_lambda") ) reg_lambda = (float)atof( val ); + if( !strcmp( name, "reg_method") ) reg_method = (float)atof( val ); + if( !strcmp( name, "subsample") ) subsample = (float)atof( val ); + if( !strcmp( name, "use_layerwise") ) use_layerwise = atoi( val ); + if( !strcmp( name, "default_direction") ) { + if( !strcmp( val, "learn") ) default_direction = 0; + if( !strcmp( val, "left") ) default_direction = 1; + if( !strcmp( val, "right") ) default_direction = 2; + } + } + protected: + // functions for L1 cost + static inline double ThresholdL1( double w, double lambda ){ + if( w > +lambda ) return w - lambda; + if( w < -lambda ) return w + lambda; + return 0.0; + } + inline double CalcWeight( double sum_grad, double sum_hess )const{ + if( sum_hess < min_child_weight ){ + return 0.0; + }else{ + switch( reg_method ){ + case 1: return - ThresholdL1( sum_grad, reg_lambda ) / sum_hess; + case 2: return - sum_grad / ( sum_hess + reg_lambda ); + // elstic net + case 3: return - ThresholdL1( sum_grad, 0.5 * reg_lambda ) / ( sum_hess + 0.5 * reg_lambda ); + default: return - sum_grad / sum_hess; + } + } + } + private: + inline static double Sqr( double a ){ + return a * a; + } + public: + // calculate the cost of loss function + inline double CalcCost( double sum_grad, double sum_hess ) const{ + if( sum_hess < min_child_weight ){ + return 0.0; + } + switch( reg_method ){ + case 1 : return Sqr( ThresholdL1( sum_grad, reg_lambda ) ) / sum_hess; + case 2 : return Sqr( sum_grad ) / ( sum_hess + reg_lambda ); + // elstic net + case 3 : return Sqr( ThresholdL1( sum_grad, 0.5 * reg_lambda ) ) / ( sum_hess + 0.5 * reg_lambda ); + default: return Sqr( sum_grad ) / sum_hess; + } + } + // KEY:layerwise + // calculate cost of root + inline double CalcRootCost( double sum_grad, double sum_hess ) const{ + if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess ); + else return 
0.0; + } + // KEY:layerwise + // calculate the cost after split + // base_weight: the base_weight of parent + inline double CalcCost( double sum_grad, double sum_hess, double base_weight ) const{ + if( use_layerwise == 0 ) return this->CalcCost( sum_grad, sum_hess ); + else return this->CalcCost( sum_grad + sum_hess * base_weight, sum_hess ); + } + // calculate the weight of leaf + inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{ + if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess ); + else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess ); + } + /*! \brief given the loss change, whether we need to invode prunning */ + inline bool need_prune( double loss_chg, int depth ) const{ + return loss_chg < min_split_loss; + } + /*! \brief whether we can split with current hessian */ + inline bool cannot_split( double sum_hess, int depth ) const{ + return sum_hess < min_child_weight * 2.0; + } + }; + }; +}; +#endif diff --git a/booster/xgboost.cpp b/booster/xgboost.cpp index 381067d08..6e04b3166 100644 --- a/booster/xgboost.cpp +++ b/booster/xgboost.cpp @@ -10,6 +10,9 @@ #include "xgboost.h" #include "../utils/xgboost_utils.h" #include "xgboost_gbmbase.h" +// implementations of boosters +#include "tree/xgboost_svdf_tree.hpp" + namespace xgboost{ namespace booster{ /*! diff --git a/booster/xgboost.h b/booster/xgboost.h index f1bfdb7ba..5e7bd47f6 100644 --- a/booster/xgboost.h +++ b/booster/xgboost.h @@ -3,6 +3,7 @@ /*! * \file xgboost.h * \brief the general gradient boosting interface + * * \author Tianqi Chen: tianqi.tchen@gmail.com */ #include diff --git a/booster/xgboost_gbmbase.h b/booster/xgboost_gbmbase.h index 66d51ca7a..c43f79871 100644 --- a/booster/xgboost_gbmbase.h +++ b/booster/xgboost_gbmbase.h @@ -53,7 +53,7 @@ namespace xgboost{ /*! \brief type of tree used */ int booster_type; /*! \brief number of root: default 0, means single tree */ - int num_root; + int num_roots; /*! \brief number of features to be used by boosters */ int num_feature; /*! \brief size of predicton buffer allocated for buffering boosting computation */ @@ -69,7 +69,7 @@ namespace xgboost{ Param( void ){ num_boosters = 0; booster_type = 0; - num_root = num_feature = 0; + num_roots = num_feature = 0; do_reboost = 0; num_pbuffer = 0; memset( reserved, 0, sizeof( reserved ) ); @@ -83,7 +83,7 @@ namespace xgboost{ if( !strcmp("booster_type", name ) ) booster_type = atoi( val ); if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val ); if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val ); - if( !strcmp("bst:num_root", name ) ) num_root = atoi( val ); + if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val ); if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); } }; @@ -98,9 +98,12 @@ namespace xgboost{ * \param val value of the parameter */ inline void SetParam( const char *name, const char *val ){ - if( strncmp( name, "bst:", 4 ) == 0 ){ + if( !strncmp( name, "bst:", 4 ) ){ cfg.PushBack( name + 4, val ); } + if( !strcmp( name, "silent") ){ + cfg.PushBack( name, val ); + } if( boosters.size() == 0 ) param.SetParam( name, val ); } /*! diff --git a/utils/xgboost_matrix_csr.h b/utils/xgboost_matrix_csr.h new file mode 100644 index 000000000..58c54f77b --- /dev/null +++ b/utils/xgboost_matrix_csr.h @@ -0,0 +1,152 @@ +/*! 
+ * \file xgboost_matrix_csr.h + * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix + * \author Tianqi Chen: tianqi.tchen@gmail.com +*/ +#ifndef _XGBOOST_MATRIX_CSR_H_ +#define _XGBOOST_MATRIX_CSR_H_ +#include +#include +#include "xgboost_utils.h" + +namespace xgboost{ + namespace utils{ + /*! + * \brief a class used to help construct CSR format matrix, + * can be used to convert row major CSR to column major CSR + * \tparam IndexType type of index used to store the index position, usually unsigned or size_t + * \tparam whether enabling the usage of aclist, this option must be enabled manually + */ + template + struct SparseCSRMBuilder{ + private: + /*! \brief dummy variable used in the indicator matrix construction */ + std::vector dummy_aclist; + /*! \brief pointer to each of the row */ + std::vector &rptr; + /*! \brief index of nonzero entries in each row */ + std::vector &findex; + /*! \brief a list of active rows, used when many rows are empty */ + std::vector &aclist; + public: + SparseCSRMBuilder( std::vector &p_rptr, + std::vector &p_findex ) + :rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){ + Assert( !UseAcList, "enabling bug" ); + } + /*! \brief use with caution! rptr must be cleaned before use */ + SparseCSRMBuilder( std::vector &p_rptr, + std::vector &p_findex, + std::vector &p_aclist ) + :rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){ + Assert( UseAcList, "must manually enable the option use aclist" ); + } + public: + /*! + * \brief step 1: initialize the number of rows in the data + * \nrows number of rows in the matrix + */ + inline void InitBudget( size_t nrows ){ + if( !UseAcList ){ + rptr.resize( nrows + 1 ); + std::fill( rptr.begin(), rptr.end(), 0 ); + }else{ + Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" ); + this->Cleanup(); + } + } + /*! + * \brief step 2: add budget to each rows, this function is called when aclist is used + * \param row_id the id of the row + * \param nelem number of element budget add to this row + */ + inline void AddBudget( size_t row_id, size_t nelem = 1 ){ + if( UseAcList ){ + if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id ); + } + rptr[ row_id + 1 ] += nelem; + } + /*! \brief step 3: initialize the necessary storage */ + inline void InitStorage( void ){ + // initialize rptr to be beginning of each segment + size_t start = 0; + if( !UseAcList ){ + for( size_t i = 1; i < rptr.size(); i ++ ){ + size_t rlen = rptr[ i ]; + rptr[ i ] = start; + start += rlen; + } + }else{ + // case with active list + std::sort( aclist.begin(), aclist.end() ); + + for( size_t i = 0; i < aclist.size(); i ++ ){ + size_t ridx = aclist[ i ]; + size_t rlen = rptr[ ridx + 1 ]; + rptr[ ridx + 1 ] = start; + // set previous rptr to right position if previous feature is not active + if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start; + start += rlen; + } + } + findex.resize( start ); + } + /*! + * \brief step 4: + * used in indicator matrix construction, add new + * element to each row, the number of calls shall be exactly same as add_budget + */ + inline void PushElem( size_t row_id, IndexType col_id ){ + size_t &rp = rptr[ row_id + 1 ]; + findex[ rp ++ ] = col_id; + } + /*! 
+ * \brief step 5: only needed when aclist is used + * clean up the rptr for next usage + */ + inline void Cleanup( void ){ + Assert( UseAcList, "this function can only be called use AcList" ); + for( size_t i = 0; i < aclist.size(); i ++ ){ + const size_t ridx = aclist[i]; + rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0; + } + aclist.clear(); + } + }; + }; + + namespace utils{ + /*! + * \brief simple sparse matrix container + * \tparam IndexType type of index used to store the index position, usually unsigned or size_t + */ + template + struct SparseCSRMat{ + private: + /*! \brief pointer to each of the row */ + std::vector rptr; + /*! \brief index of nonzero entries in each row */ + std::vector findex; + public: + /*! \brief matrix builder*/ + SparseCSRMBuilder builder; + public: + SparseCSRMat( void ):builder( rptr, findex ){ + } + public: + /*! \return number of rows in the matrx */ + inline size_t NumRow( void ) const{ + return rptr.size() - 1; + } + /*! \return number of elements r-th row */ + inline size_t NumElem( size_t r ) const{ + return rptr[ r + 1 ] - rptr[ r ]; + } + /*! \return r-th row */ + inline const IndexType *operator[]( size_t r ) const{ + return &findex[ rptr[r] ]; + } + }; + }; +}; +#endif diff --git a/utils/xgboost_random.h b/utils/xgboost_random.h new file mode 100644 index 000000000..e7caef757 --- /dev/null +++ b/utils/xgboost_random.h @@ -0,0 +1,131 @@ +#ifndef _XGBOOST_RANDOM_H_ +#define _XGBOOST_RANDOM_H_ +/*! + * \file xgboost_random.h + * \brief PRNG to support random number generation + * \author Tianqi Chen: tianqi.tchen@gmail.com + * + * Use standard PRNG from stdlib + */ +#include +#include +#include + +#ifdef _MSC_VER +typedef unsigned char uint8_t; +typedef unsigned short int uint16_t; +typedef unsigned int uint32_t; +#else +#include +#endif + +/*! namespace of PRNG */ +namespace xgboost{ + namespace random{ + /*! \brief seed the PRNG */ + inline void Seed( uint32_t seed ){ + srand( seed ); + } + + /*! \brief return a real number uniform in [0,1) */ + inline double NextDouble(){ + return static_cast( rand() ) / (static_cast( RAND_MAX )+1.0); + } + /*! \brief return a real numer uniform in (0,1) */ + inline double NextDouble2(){ + return (static_cast( rand() ) + 1.0 ) / (static_cast(RAND_MAX) + 2.0); + } + }; + + namespace random{ + /*! \brief return a random number */ + inline uint32_t NextUInt32( void ){ + return (uint32_t)rand(); + } + /*! \brief return a random number in n */ + inline uint32_t NextUInt32( uint32_t n ){ + return (uint32_t) floor( NextDouble() * n ) ; + } + /*! \brief return x~N(0,1) */ + inline double SampleNormal(){ + double x,y,s; + do{ + x = 2 * NextDouble2() - 1.0; + y = 2 * NextDouble2() - 1.0; + s = x*x + y*y; + }while( s >= 1.0 || s == 0.0 ); + + return x * sqrt( -2.0 * log(s) / s ) ; + } + + /*! \brief return iid x,y ~N(0,1) */ + inline void SampleNormal2D( double &xx, double &yy ){ + double x,y,s; + do{ + x = 2 * NextDouble2() - 1.0; + y = 2 * NextDouble2() - 1.0; + s = x*x + y*y; + }while( s >= 1.0 || s == 0.0 ); + double t = sqrt( -2.0 * log(s) / s ) ; + xx = x * t; + yy = y * t; + } + /*! \brief return x~N(mu,sigma^2) */ + inline double SampleNormal( double mu, double sigma ){ + return SampleNormal() * sigma + mu; + } + + /*! \brief return 1 with probability p, coin flip */ + inline int SampleBinary( double p ){ + return NextDouble() < p; + } + + /*! 
\brief return distribution from Gamma( alpha, beta ) */
+        inline double SampleGamma( double alpha, double beta ) {
+            if ( alpha < 1.0 ) {
+                double u;
+                do {
+                    u = NextDouble();
+                } while (u == 0.0);
+                return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
+            } else {
+                double d,c,x,v,u;
+                d = alpha - 1.0/3.0;
+                c = 1.0 / sqrt( 9.0 * d );
+                do {
+                    do {
+                        x = SampleNormal();
+                        v = 1.0 + c*x;
+                    } while ( v <= 0.0 );
+                    v = v * v * v;
+                    u = NextDouble();
+                } while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x)))
+                          && (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) );
+                return d * v / beta;
+            }
+        }
+
+        template<typename T>
+        inline void Exchange( T &a, T &b ){
+            T c;
+            c = a;
+            a = b;
+            b = c;
+        }
+
+        template<typename T>
+        inline void Shuffle( T *data, size_t sz ){
+            if( sz == 0 ) return;
+            for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){
+                Exchange( data[i], data[ NextUInt32( i+1 ) ] );
+            }
+        }
+        // random shuffle the data inside, require PRNG
+        template<typename T>
+        inline void Shuffle( std::vector<T> &data ){
+            Shuffle( &data[0], data.size() );
+        }
+    };
+};
+
+#endif
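
RTSelecter::Entry and TreeModel::Node in the diff above both fold the default-direction flag into the top bit of a 32-bit index (sindex), and Node::sparent reuses the same trick to mark whether a node is a left child. A minimal standalone sketch of that encoding; the PackedSplit name and the main() driver are illustrative, not part of the diff:

    #include <cassert>
    #include <cstdint>

    // Low 31 bits store the feature index, the top bit stores "default left",
    // mirroring how Node::sindex / Entry::sindex are packed above.
    struct PackedSplit {
        uint32_t sindex;

        void set(uint32_t split_index, bool default_left) {
            if (default_left) split_index |= (1U << 31);
            sindex = split_index;
        }
        uint32_t split_index() const { return sindex & ((1U << 31) - 1U); }
        bool default_left() const { return (sindex >> 31) != 0; }
    };

    int main() {
        PackedSplit s;
        s.set(12345, true);
        assert(s.split_index() == 12345 && s.default_left());
        s.set(7, false);
        assert(s.split_index() == 7 && !s.default_left());
        return 0;
    }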
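
TreeParamTrain computes leaf weights and split costs from summed gradients and hessians, with reg_method selecting L1, L2, or elastic-net shrinkage, and the "layerwise" variants shifting the statistics by the parent's base weight. A standalone restatement of those formulas under the assumption that reg_method == 2 (plain L2); function names here are illustrative, not the original class API:

    #include <cstdio>

    // Soft-thresholding used for the L1 / elastic-net cases above.
    static double ThresholdL1(double w, double lambda) {
        if (w > +lambda) return w - lambda;
        if (w < -lambda) return w + lambda;
        return 0.0;
    }
    // L2-regularized leaf weight and the cost it contributes to the objective.
    static double CalcWeightL2(double G, double H, double lambda) { return -G / (H + lambda); }
    static double CalcCostL2(double G, double H, double lambda) { return G * G / (H + lambda); }

    // Layerwise variants: statistics are shifted by the parent's base weight, so the
    // child weight is expressed as a delta on top of parent_base_weight.
    static double CalcWeightLayerwise(double G, double H, double parent_w, double lambda) {
        return parent_w + CalcWeightL2(G + parent_w * H, H, lambda);
    }
    static double CalcCostLayerwise(double G, double H, double base_w, double lambda) {
        return CalcCostL2(G + H * base_w, H, lambda);
    }

    int main() {
        double G = -4.0, H = 9.0, lambda = 1.0;
        printf("plain weight = %g\n", CalcWeightL2(G, H, lambda));            // 0.4
        printf("plain cost   = %g\n", CalcCostL2(G, H, lambda));              // 1.6
        printf("layer weight = %g\n", CalcWeightLayerwise(G, H, 0.1, lambda));
        printf("L1 threshold = %g\n", ThresholdL1(G, lambda));                // -3
        return 0;
    }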
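
enumerate_split() above scans each feature's sorted column twice, once forward (missing values default right) and once backward (missing values default left), accumulating child gradient/hessian sums and scoring each distinct-value boundary as Cost(left) + Cost(right) - Cost(root). A simplified single-direction sketch of that search, assuming plain L2 cost; the Entry/Split/FindSplit names are illustrative and the backward pass is symmetric:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // (feature value, gradient, hessian) for rows that HAVE this feature;
    // rows with a missing value implicitly fall to the other side of the split.
    struct Entry { float fvalue; double grad, hess; };

    static double Cost(double G, double H, double lambda) { return G * G / (H + lambda); }

    struct Split { double gain = 0.0; float value = 0.0f; };

    Split FindSplit(std::vector<Entry> entries, double root_G, double root_H,
                    double lambda, double min_child_weight) {
        std::sort(entries.begin(), entries.end(),
                  [](const Entry &a, const Entry &b) { return a.fvalue < b.fvalue; });
        const double root_cost = Cost(root_G, root_H, lambda);
        const float eps = 1e-5f;
        Split best;
        double cG = 0.0, cH = 0.0;                       // stats of the scanned side
        for (size_t j = 0; j < entries.size(); ++j) {
            cG += entries[j].grad;
            cH += entries[j].hess;
            const bool boundary =
                j + 1 == entries.size() || entries[j].fvalue + 2 * eps < entries[j + 1].fvalue;
            if (!boundary) continue;
            const double dH = root_H - cH;               // everything else, incl. missing
            if (cH < min_child_weight || dH < min_child_weight) continue;
            const double gain = Cost(cG, cH, lambda) + Cost(root_G - cG, dH, lambda) - root_cost;
            if (gain > best.gain) {
                best.gain = gain;
                best.value = (j + 1 == entries.size())
                                 ? entries[j].fvalue + eps
                                 : 0.5f * (entries[j].fvalue + entries[j + 1].fvalue);
            }
        }
        return best;
    }

    int main() {
        std::vector<Entry> e = {{1.0f, -1.0, 1.0}, {2.0f, -1.0, 1.0},
                                {5.0f,  2.0, 1.0}, {6.0f,  2.5, 1.0}};
        Split s = FindSplit(e, 2.5, 4.0, 1.0, 1.0);
        printf("gain=%g, split at %g\n", s.gain, s.value);   // best split between 2 and 5
        return 0;
    }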
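
make_split() above re-arranges the node's id set in place: the instances that appeared in the winning feature's column (qset) are peeled off with a single merge-style sweep so that both the default-direction part and the split part stay sorted. A standalone sketch of that partition step, using illustrative names rather than the Task/idset machinery of the diff:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    void PartitionIdSet(std::vector<unsigned> &idset, std::vector<unsigned> qset) {
        std::sort(qset.begin(), qset.end());
        size_t top = 0;
        // compact the ids NOT in qset to the front, preserving their order
        for (size_t i = 0; i < idset.size(); ++i) {
            if (top < qset.size() && idset[i] == qset[top]) {
                ++top;
            } else {
                idset[i - top] = idset[i];
            }
        }
        // the tail is then overwritten with qset itself
        std::copy(qset.begin(), qset.end(), idset.end() - qset.size());
    }

    int main() {
        std::vector<unsigned> idset = {1, 3, 4, 7, 9, 12};
        PartitionIdSet(idset, {4, 9});
        for (unsigned v : idset) printf("%u ", v);   // prints: 1 3 7 12 4 9
        printf("\n");
        return 0;
    }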
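
try_prune_leaf() above prunes bottom-up: whenever a node is turned into a leaf it bumps its parent's leaf_child_cnt, and once both children are leaves the parent's split is undone if the recorded gain is below min_split_loss, with pruning then propagating upward. A simplified sketch of that rule (the PNode layout and the explicit is_leaf flag are stand-ins for the diff's Node/NodeStat pair):

    #include <cstdio>
    #include <vector>

    struct PNode {
        int parent = -1, cleft = -1, cright = -1;
        float loss_chg = 0.0f;     // gain recorded when the node was split
        int leaf_child_cnt = 0;
        bool is_leaf = false;
    };

    void TryPruneLeaf(std::vector<PNode> &tree, int nid, float min_split_loss, int &num_pruned) {
        int pid = tree[nid].parent;
        if (pid < 0) return;                       // reached a root
        PNode &p = tree[pid];
        p.leaf_child_cnt += 1;
        if (p.leaf_child_cnt >= 2 && p.loss_chg < min_split_loss) {
            p.is_leaf = true;                      // collapse the split back into a leaf
            num_pruned += 2;                       // the two deleted children
            TryPruneLeaf(tree, pid, min_split_loss, num_pruned);
        }
    }

    int main() {
        // node 0 splits into 1 and 2 with a tiny gain; both children turn into leaves
        std::vector<PNode> tree(3);
        tree[0].cleft = 1; tree[0].cright = 2; tree[0].loss_chg = 0.05f;
        tree[1].parent = 0; tree[2].parent = 0;
        tree[1].is_leaf = tree[2].is_leaf = true;
        int num_pruned = 0;
        TryPruneLeaf(tree, 1, /*min_split_loss=*/0.1f, num_pruned);
        TryPruneLeaf(tree, 2, /*min_split_loss=*/0.1f, num_pruned);
        printf("pruned %d nodes, root is now %s\n",
               num_pruned, tree[0].is_leaf ? "a leaf" : "a split");
        return 0;
    }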
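
SparseCSRMBuilder in xgboost_matrix_csr.h drives the column-major conversion in expand(): pass one budgets a count per output row (InitBudget/AddBudget), InitStorage turns the counts into offsets, and pass two scatters the elements (PushElem). A minimal independent sketch of the same two-pass idea, transposing a row-major CSR matrix; the CSR/Transpose names are illustrative, not the builder class itself:

    #include <cstdio>
    #include <vector>

    struct CSR {
        std::vector<size_t> rptr;      // rptr[i]..rptr[i+1] bounds row i
        std::vector<unsigned> cindex;  // column index of each entry
    };

    CSR Transpose(const CSR &in, size_t ncols) {
        CSR out;
        out.rptr.assign(ncols + 1, 0);
        // pass 1: budget, count how many entries land in each output row (= input column)
        for (unsigned c : in.cindex) out.rptr[c + 1] += 1;
        for (size_t i = 1; i < out.rptr.size(); ++i) out.rptr[i] += out.rptr[i - 1];
        // pass 2: fill, using a moving cursor per output row
        out.cindex.resize(in.cindex.size());
        std::vector<size_t> cursor(out.rptr.begin(), out.rptr.end() - 1);
        for (size_t r = 0; r + 1 < in.rptr.size(); ++r) {
            for (size_t j = in.rptr[r]; j < in.rptr[r + 1]; ++j) {
                out.cindex[cursor[in.cindex[j]]++] = static_cast<unsigned>(r);
            }
        }
        return out;
    }

    int main() {
        // 2 x 3 matrix with entries at (0,0), (0,2), (1,1)
        CSR m; m.rptr = {0, 2, 3}; m.cindex = {0, 2, 1};
        CSR t = Transpose(m, 3);
        for (size_t c = 0; c + 1 < t.rptr.size(); ++c)
            for (size_t j = t.rptr[c]; j < t.rptr[c + 1]; ++j)
                printf("col %zu has row %u\n", c, t.cindex[j]);
        return 0;
    }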
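
Prediction in RTreeTrainer walks from a root to a leaf via GetLeafIndex()/get_next(), taking the stored default direction whenever a feature is marked unknown. A sketch of that traversal over a simplified flat node array (SimpleNode and Predict are stand-ins, not the TreeModel::Node class):

    #include <cstdio>
    #include <vector>

    struct SimpleNode {
        int cleft = -1, cright = -1;    // cleft == -1 marks a leaf
        unsigned split_index = 0;
        float split_value = 0.0f;
        bool default_left = false;
        float leaf_value = 0.0f;
        bool is_leaf() const { return cleft == -1; }
    };

    float Predict(const std::vector<SimpleNode> &tree,
                  const std::vector<float> &feat,
                  const std::vector<bool> &missing,
                  int root = 0) {
        int nid = root;
        while (!tree[nid].is_leaf()) {
            const SimpleNode &n = tree[nid];
            if (missing[n.split_index]) {
                nid = n.default_left ? n.cleft : n.cright;
            } else {
                nid = feat[n.split_index] < n.split_value ? n.cleft : n.cright;
            }
        }
        return tree[nid].leaf_value;
    }

    int main() {
        // root splits on feature 0 at 0.5, children are leaves with values -1 and +1
        std::vector<SimpleNode> tree(3);
        tree[0].cleft = 1; tree[0].cright = 2;
        tree[0].split_index = 0; tree[0].split_value = 0.5f; tree[0].default_left = true;
        tree[1].leaf_value = -1.0f;
        tree[2].leaf_value = +1.0f;
        printf("%g %g %g\n",
               Predict(tree, {0.2f}, {false}),    // goes left  -> -1
               Predict(tree, {0.9f}, {false}),    // goes right -> +1
               Predict(tree, {0.0f}, {true}));    // missing    -> default left -> -1
        return 0;
    }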
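
init_tasks() builds the initial id set by keeping every instance with a valid hessian and, when subsample < 1, retaining each one via an independent coin flip from random::SampleBinary(). A small sketch of that row-subsampling step, reusing the rand()-based helpers defined in xgboost_random.h; BuildRowSet is an illustrative name:

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    static double NextDouble() {
        return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
    }
    static int SampleBinary(double p) { return NextDouble() < p; }

    std::vector<unsigned> BuildRowSet(const std::vector<float> &hess, float subsample) {
        std::vector<unsigned> idset;
        for (size_t i = 0; i < hess.size(); ++i) {
            if (hess[i] < 0.0f) continue;               // deleted/invalid instance
            if (subsample > 1.0f - 1e-6f || SampleBinary(subsample)) {
                idset.push_back(static_cast<unsigned>(i));
            }
        }
        return idset;
    }

    int main() {
        srand(0);
        std::vector<float> hess(1000, 1.0f);
        printf("kept %zu of %zu rows at subsample=0.5\n",
               BuildRowSet(hess, 0.5f).size(), hess.size());
        return 0;
    }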