diff --git a/Makefile b/Makefile
index cb142d3d8..819355989 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 export CC = gcc
 export CXX = g++
-export CFLAGS = -Wall -O3 -msse2
+export CFLAGS = -Wall -O3 -msse2 -fopenmp
 
 # specify tensor path
 BIN = xgboost
diff --git a/booster/linear/xgboost_linear.hpp b/booster/linear/xgboost_linear.hpp
index a6e10199f..90d79c0bb 100644
--- a/booster/linear/xgboost_linear.hpp
+++ b/booster/linear/xgboost_linear.hpp
@@ -3,7 +3,7 @@
 /*!
  * \file xgboost_linear.h
  * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
- *        the update rule is coordinate descent
+ *        the update rule is coordinate descent, requires column major format
  * \author Tianqi Chen: tianqi.tchen@gmail.com
  */
 #include <vector>
@@ -11,7 +11,6 @@
 #include "../xgboost.h"
 #include "../../utils/xgboost_utils.h"
-#include "../../utils/xgboost_matrix_csr.h"
 
 namespace xgboost{
     namespace booster{
@@ -41,7 +40,7 @@ namespace xgboost{
                               const FMatrixS &smat,
                               const std::vector<unsigned> &root_index ){
            utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
-           this->Update( smat, grad, hess );
+           this->UpdateWeights( grad, hess, smat );
        }
        virtual float Predict( const FMatrixS::Line &sp, unsigned rid = 0 ){
            float sum = model.bias();
@@ -149,29 +148,17 @@ namespace xgboost{
                return weight.back();
            }
        };
-       /*! \brief array entry for column based feature construction */
-       struct SCEntry{
-           /*! \brief feature value */
-           float fvalue;
-           /*! \brief row index related to each row */
-           unsigned rindex;
-           /*! \brief default constructor */
-           SCEntry( void ){}
-           /*! \brief constructor using entry */
-           SCEntry( float fvalue, unsigned rindex ){
-               this->fvalue = fvalue; this->rindex = rindex;
-           }
-       };
    private:
        int silent;
    protected:
        Model model;
        ParamTrain param;
    protected:
+       // update weights, should work for any FMatrix
+       template<typename FMatrix>
        inline void UpdateWeights( std::vector<float> &grad,
                                   const std::vector<float> &hess,
-                                  const std::vector<size_t> &rptr,
-                                  const std::vector<SCEntry> &entry ){
+                                  const FMatrix &smat ){
            {// optimize bias
                double sum_grad = 0.0, sum_hess = 0.0;
                for( size_t i = 0; i < grad.size(); i ++ ){
@@ -187,70 +174,25 @@ namespace xgboost{
            }
            // optimize weight
-           const int nfeat = model.param.num_feature;
-           for( int i = 0; i < nfeat; i ++ ){
-               size_t start = rptr[i];
-               size_t end = rptr[i+1];
-               if( start >= end ) continue;
+           const unsigned nfeat = (unsigned)smat.NumCol();
+           for( unsigned i = 0; i < nfeat; i ++ ){
+               if( !smat.GetSortedCol( i ).Next() ) continue;
                double sum_grad = 0.0, sum_hess = 0.0;
-               for( size_t j = start; j < end; j ++ ){
-                   const float v = entry[j].fvalue;
-                   sum_grad += grad[ entry[j].rindex ] * v;
-                   sum_hess += hess[ entry[j].rindex ] * v * v;
+               for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
+                   const float v = it.fvalue();
+                   sum_grad += grad[ it.rindex() ] * v;
+                   sum_hess += hess[ it.rindex() ] * v * v;
                }
                float w = model.weight[ i ];
                double dw = param.learning_rate * param.CalcDelta( sum_grad, sum_hess, w );
                model.weight[ i ] += dw;
                // update grad value
-               for( size_t j = start; j < end; j ++ ){
-                   const float v = entry[j].fvalue;
-                   grad[ entry[j].rindex ] += hess[ entry[j].rindex ] * v * dw;
+               for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
+                   const float v = it.fvalue();
+                   grad[ it.rindex() ] += hess[ it.rindex() ] * v * dw;
                }
            }
        }
-       inline void MakeCmajor( std::vector<size_t> &rptr,
-                               std::vector<SCEntry> &entry,
-                               const std::vector<float> &hess,
-                               const FMatrixS &smat ){
-           // transform to column order first
-           const int nfeat = model.param.num_feature;
-           // build CSR column major format data
-           utils::SparseCSRMBuilder<SCEntry> builder( rptr, entry );
-           builder.InitBudget( nfeat );
-           for( unsigned i = 0; i < (unsigned)hess.size(); i ++ ){
-               // skip deleted entries
-               if( hess[i] < 0.0f ) continue;
-               // add sparse part budget
-               FMatrixS::Line sp = smat[ i ];
-               for( unsigned j = 0; j < sp.len; j ++ ){
-                   if( j == 0 || sp[j-1].findex != sp[j].findex ){
-                       builder.AddBudget( sp[j].findex );
-                   }
-               }
-           }
-           builder.InitStorage();
-           for( unsigned i = 0; i < (unsigned)hess.size(); i ++ ){
-               // skip deleted entries
-               if( hess[i] < 0.0f ) continue;
-               // add sparse part budget
-               FMatrixS::Line sp = smat[ i ];
-               for( unsigned j = 0; j < sp.len; j ++ ){
-                   // skip duplicated terms
-                   if( j == 0 || sp[j-1].findex != sp[j].findex ){
-                       builder.PushElem( sp[j].findex, SCEntry( sp[j].fvalue, i ) );
-                   }
-               }
-           }
-       }
-   protected:
-       virtual void Update( const FMatrixS &smat,
-                            std::vector<float> &grad,
-                            const std::vector<float> &hess ){
-           std::vector<size_t> rptr;
-           std::vector<SCEntry> entry;
-           this->MakeCmajor( rptr, entry, hess, smat );
-           this->UpdateWeights( grad, hess, rptr, entry );
-       }
    };
 };
};
diff --git a/booster/tree/xgboost_svdf_tree.hpp b/booster/tree/xgboost_svdf_tree.hpp
index 8155618b2..b504910a4 100644
--- a/booster/tree/xgboost_svdf_tree.hpp
+++ b/booster/tree/xgboost_svdf_tree.hpp
@@ -2,7 +2,7 @@
 #define _XGBOOST_APEX_TREE_HPP_
 /*!
  * \file xgboost_svdf_tree.hpp
- * \brief implementation of regression tree, with layerwise support
+ * \brief implementation of regression tree constructor, with layerwise support
  *        this file is adapted from GBRT implementation in SVDFeature project
  * \author Tianqi Chen: tqchen@apex.sjtu.edu.cn, tianqi.tchen@gmail.com
 */
@@ -12,18 +12,7 @@
 #include "../../utils/xgboost_matrix_csr.h"
 
 namespace xgboost{
-    namespace booster{
-        const bool rt_debug = false;
-        // whether to check bugs
-        const bool check_bug = false;
-
-        const float rt_eps = 1e-5f;
-        const float rt_2eps = rt_eps * 2.0f;
-
-        inline double sqr( double a ){
-            return a * a;
-        }
-
+    namespace booster{
        inline void assert_sorted( unsigned *idset, int len ){
            if( !rt_debug || !check_bug ) return;
            for( int i = 1; i < len; i ++ ){
@@ -32,21 +21,7 @@
            }
        };
-    namespace booster{
-        // node stat used in rtree
-        struct RTreeNodeStat{
-            // loss chg caused by current split
-            float loss_chg;
-            // weight of current node
-            float base_weight;
-            // number of child that is leaf node known up to now
-            int leaf_child_cnt;
-        };
-
-        // structure of Regression Tree
-        class RTree: public TreeModel<bst_float, RTreeNodeStat>{
-        };
-
+    namespace booster{
        // selecter of rtree to find the suitable candidate
        class RTSelecter{
        public:
@@ -88,7 +63,9 @@
            }
        };
 
+       // updater of rtree, allows the parameters to be stored inside, key solver
+       template<typename FMatrix>
        class RTreeUpdater{
        protected:
            // training task, element of single task
@@ -128,10 +105,10 @@
            // training parameter
            const TreeParamTrain &param;
            // parameters, reference
-           RTree &tree;
+           RegTree &tree;
            std::vector<float> &grad;
            std::vector<float> &hess;
-           const FMatrixS &smat;
+           const FMatrix &smat;
            const std::vector<unsigned> &group_id;
        private:
            // maximum depth up to now
@@ -158,7 +135,7 @@
            inline void try_prune_leaf( int nid, int depth ){
                if( tree[ nid ].is_root() ) return;
                int pid = tree[ nid ].parent();
-               RTree::NodeStat &s = tree.stat( pid );
+               RegTree::NodeStat &s = tree.stat( pid );
                s.leaf_child_cnt ++;
                if( s.leaf_child_cnt >= 2 &&
                    param.need_prune( s.loss_chg, depth - 1 ) ){
@@ -186,7 +163,7 @@
            // make split for current task, re-arrange positions in idset
            inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double base_weight ){
                // before split, first prepare statistics
-               RTree::NodeStat &s = tree.stat( tsk.nid );
+               RegTree::NodeStat &s = tree.stat( tsk.nid );
                s.loss_chg = loss_chg;
                s.leaf_child_cnt = 0;
                s.base_weight = static_cast<float>( base_weight );
@@ -214,7 +191,7 @@
                    }
                }
                // get two parts
-               RTree::Node &n = tree[ tsk.nid ];
+               RegTree::Node &n = tree[ tsk.nid ];
                Task def_part( n.default_left() ? n.cleft() : n.cright(), tsk.idset, tsk.len - qset.size(), s.base_weight );
                Task spl_part( n.default_left() ? n.cright(): n.cleft() , tsk.idset + def_part.len, qset.size(), s.base_weight );
                // fill back split part
@@ -320,9 +297,8 @@
                    rsum_grad += grad[ ridx ];
                    rsum_hess += hess[ ridx ];
 
-                   FMatrixS::Line sp = smat[ ridx ];
-                   for( unsigned j = 0; j < sp.len; j ++ ){
-                       builder.AddBudget( sp[j].findex );
+                   for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
+                       builder.AddBudget( it.findex() );
                    }
                }
 
@@ -334,10 +310,9 @@
                builder.InitStorage();
                for( unsigned i = 0; i < tsk.len; i ++ ){
                    const unsigned ridx = tsk.idset[i];
-                   FMatrixS::Line sp = smat[ ridx ];
-                   for( unsigned j = 0; j < sp.len; j ++ ){
-                       builder.PushElem( sp[j].findex, SCEntry( sp[j].fvalue, ridx ) );
-                   }
+                   for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
+                       builder.PushElem( it.findex(), SCEntry( it.fvalue(), ridx ) );
+                   }
                }
                // --- end of building column major matrix ---
                // after this point, tmp_rptr and entry is ready to use
@@ -426,10 +401,10 @@
            }
        public:
            RTreeUpdater( const TreeParamTrain &pparam,
-                         RTree &ptree,
+                         RegTree &ptree,
                          std::vector<float> &pgrad,
                          std::vector<float> &phess,
-                         const FMatrixS &psmat,
+                         const FMatrix &psmat,
                          const std::vector<unsigned> &pgroup_id ):
                param( pparam ), tree( ptree ),
                grad( pgrad ), hess( phess ), smat( psmat ), group_id( pgroup_id ){
@@ -446,113 +421,6 @@
                return max_depth;
            }
        };
-
-
-
-       class RTreeTrainer : public IBooster{
-       private:
-           int silent;
-           // tree of current shape
-           RTree tree;
-           TreeParamTrain param;
-       private:
-           std::vector<float> tmp_feat;
-           std::vector<bool> tmp_funknown;
-           inline void init_tmpfeat( void ){
-               if( tmp_feat.size() != (size_t)tree.param.num_feature ){
-                   tmp_feat.resize( tree.param.num_feature );
-                   tmp_funknown.resize( tree.param.num_feature );
-                   std::fill( tmp_funknown.begin(), tmp_funknown.end(), true );
-               }
-           }
-       public:
-           virtual void SetParam( const char *name, const char *val ){
-               if( !strcmp( name, "silent") ) silent = atoi( val );
-               param.SetParam( name, val );
-               tree.param.SetParam( name, val );
-           }
-           virtual void LoadModel( utils::IStream &fi ){
-               tree.LoadModel( fi );
-           }
-           virtual void SaveModel( utils::IStream &fo ) const{
-               tree.SaveModel( fo );
-           }
-           virtual void InitModel( void ){
-               tree.InitModel();
-           }
-       private:
-           inline int get_next( int pid, float fvalue, bool is_unknown ){
-               float split_value = tree[ pid ].split_cond();
-               if( is_unknown ){
-                   if( tree[ pid ].default_left() ) return tree[ pid ].cleft();
-                   else return tree[ pid ].cright();
-               }else{
-                   if( fvalue < split_value ) return tree[ pid ].cleft();
-                   else return tree[ pid ].cright();
-               }
-           }
-       public:
-           virtual void DoBoost( std::vector<float> &grad,
-                                 std::vector<float> &hess,
-                                 const FMatrixS &smat,
-                                 const std::vector<unsigned> &group_id ){
-               utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
-               if( !silent ){
-                   printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
-               }
-               // start with a id set
-               RTreeUpdater updater( param, tree, grad, hess, smat, group_id );
-               int num_pruned;
-               tree.param.max_depth = updater.do_boost( num_pruned );
-
-               if( !silent ){
-                   printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
-                           tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.param.max_depth );
-               }
-           }
-
-           virtual int GetLeafIndex( const std::vector<float> &feat,
-                                     const std::vector<bool> &funknown,
-                                     unsigned gid = 0 ){
-               // start from groups that belongs to current data
-               int pid = (int)gid;
-               // tranverse tree
-               while( !tree[ pid ].is_leaf() ){
-                   unsigned split_index = tree[ pid ].split_index();
-                   pid = this->get_next( pid, feat[ split_index ], funknown[ split_index ] );
-               }
-               return pid;
-           }
-           virtual float Predict( const FMatrixS::Line &feat, unsigned gid = 0 ){
-               this->init_tmpfeat();
-               for( unsigned i = 0; i < feat.len; i ++ ){
-                   utils::Assert( feat[i].findex < (unsigned)tmp_funknown.size() , "input feature execeed bound" );
-                   tmp_funknown[ feat[i].findex ] = false;
-                   tmp_feat[ feat[i].findex ] = feat[i].fvalue;
-               }
-               int pid = this->GetLeafIndex( tmp_feat, tmp_funknown, gid );
-               // set back
-               for( unsigned i = 0; i < feat.len; i ++ ){
-                   tmp_funknown[ feat[i].findex ] = true;
-               }
-               return tree[ pid ].leaf_value();
-           }
-           virtual float Predict( const std::vector<float> &feat,
-                                  const std::vector<bool> &funknown,
-                                  unsigned gid = 0 ){
-               utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
-                              "input data smaller than num feature" );
-               int pid = this->GetLeafIndex( feat, funknown, gid );
-               return tree[ pid ].leaf_value();
-           }
-
-           virtual void DumpModel( FILE *fo ){
-               tree.DumpModel( fo );
-           }
-       public:
-           RTreeTrainer( void ){ silent = 0; }
-           virtual ~RTreeTrainer( void ){}
-       };
 };
};
#endif
diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h
index 7cdfea981..74f864455 100644
--- a/booster/tree/xgboost_tree_model.h
+++ b/booster/tree/xgboost_tree_model.h
@@ -306,7 +306,7 @@
            }
        };
    };
-    
+
    namespace booster{
        /*! \brief training parameters for regression tree */
        struct TreeParamTrain{
@@ -431,5 +431,20 @@
        }
    };
 };
+
+    namespace booster{
+        /*! \brief node statistics used in regression tree */
+        struct RTreeNodeStat{
+            // loss chg caused by current split
+            float loss_chg;
+            // weight of current node
+            float base_weight;
+            // number of child that is leaf node known up to now
+            int leaf_child_cnt;
+        };
+        /*! \brief most common structure of regression tree */
+        class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
+        };
+    };
};
#endif
diff --git a/booster/xgboost.cpp b/booster/xgboost.cpp
index 7507c11c6..61f42004b 100644
--- a/booster/xgboost.cpp
+++ b/booster/xgboost.cpp
@@ -11,18 +11,11 @@
 #include "../utils/xgboost_utils.h"
 #include "xgboost_gbmbase.h"
 // implementations of boosters
-#include "tree/xgboost_svdf_tree.hpp"
+#include "tree/xgboost_tree.hpp"
 #include "linear/xgboost_linear.hpp"
 
namespace xgboost{
    namespace booster{
-        /*
-         * \brief listing the types of boosters
-         */
-        enum BOOSTER_TYPE_LIST{
-            TREE,
-            LINEAR,
-        };
        /*!
         * \brief create a gradient booster, given type of booster
         * \param booster_type type of gradient booster, can be used to specify implements
@@ -30,8 +23,8 @@
         */
        IBooster *CreateBooster( int booster_type ){
            switch( booster_type ){
-           case TREE: return new RTreeTrainer();
-           case LINEAR: return new LinearBooster();
+           case 0: return new RegTreeTrainer();
+           case 1: return new LinearBooster();
            default: utils::Error("unknown booster_type"); return NULL;
            }
        }
diff --git a/booster/xgboost_data.h b/booster/xgboost_data.h
index 84935e4a7..e7d5636cc 100644
--- a/booster/xgboost_data.h
+++ b/booster/xgboost_data.h
@@ -11,6 +11,7 @@
 #include <vector>
 #include "../utils/xgboost_utils.h"
 #include "../utils/xgboost_stream.h"
+#include "../utils/xgboost_matrix_csr.h"
 
namespace xgboost{
    namespace booster{
@@ -66,10 +67,6 @@ namespace xgboost{
            inline bst_float fvalue( void ) const;
        };
    public:
-        /*!
-         * \brief prepare sorted columns so that GetSortedCol can be called
-         */
-        inline void MakeSortedCol( void );
        /*!
         * \brief get number of rows
         * \return number of rows
         */
@@ -114,13 +111,13 @@
            bst_uint findex;
            /*! \brief feature value */
            bst_float fvalue;
-       };
-       /*! \brief one entry in a row */
-       struct CEntry{
-           /*! \brief row index */
-           bst_uint rindex;
-           /*! \brief feature value */
-           bst_float fvalue;
+           /*! \brief constructor */
+           REntry( void ){}
+           /*! \brief constructor */
+           REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){}
+           inline static bool cmp_fvalue( const REntry &a, const REntry &b ){
+               return a.fvalue < b.fvalue;
+           }
        };
        /*! \brief one row of sparse feature matrix */
        struct Line{
@@ -128,25 +125,32 @@
            const REntry *data_;
            /*! \brief size of the data */
            bst_uint len;
+           /*! \brief get k-th element */
            inline const REntry& operator[]( unsigned i ) const{
                return data_[i];
            }
        };
-   public:
+       /*! \brief row iterator */
        struct RowIter{
-           const REntry *dptr, *end;
+           const REntry *dptr_, *end_;
            inline bool Next( void ){
-               if( dptr == end ) return false;
+               if( dptr_ == end_ ) return false;
                else{
-                   ++ dptr; return true;
+                   ++ dptr_; return true;
                }
            }
            inline bst_uint findex( void ) const{
-               return dptr->findex;
+               return dptr_->findex;
            }
            inline bst_float fvalue( void ) const{
-               return dptr->fvalue;
-           }
+               return dptr_->fvalue;
+           }
+       };
+       /*! \brief column iterator */
+       struct ColIter: public RowIter{
+           inline bst_uint rindex( void ) const{
+               return this->findex();
+           }
        };
    public:
        /*! \brief constructor */
@@ -167,6 +171,8 @@
            row_ptr_.clear();
            row_ptr_.push_back( 0 );
            row_data_.clear();
+           col_ptr_.clear();
+           col_data_.clear();
        }
        /*! \brief get sparse part of current row */
        inline Line operator[]( size_t sidx ) const{
@@ -175,14 +181,6 @@
            sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] );
            sp.data_ = &row_data_[ row_ptr_[ sidx ] ];
            return sp;
-       }
-       /*! \brief get row iterator*/
-       inline RowIter GetRow( size_t ridx ) const{
-           utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
-           RowIter it;
-           it.dptr = &row_data_[ row_ptr_[ridx] ] - 1;
-           it.end  = &row_data_[ row_ptr_[ridx+1] ] - 1;
-           return it;
        }
        /*!
         * \brief add a row to the matrix, with data stored in STL container
@@ -199,43 +197,124 @@
            unsigned cnt = 0;
            for( size_t i = 0; i < findex.size(); i ++ ){
                if( findex[i] < fstart || findex[i] >= fend ) continue;
-               REntry e; e.findex = findex[i]; e.fvalue = fvalue[i];
-               row_data_.push_back( e );
+               row_data_.push_back( REntry( findex[i], fvalue[i] ) );
                cnt ++;
            }
            row_ptr_.push_back( row_ptr_.back() + cnt );
            return row_ptr_.size() - 2;
        }
+       /*! \brief get row iterator*/
+       inline RowIter GetRow( size_t ridx ) const{
+           utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
+           RowIter it;
+           it.dptr_ = &row_data_[ row_ptr_[ridx] ] - 1;
+           it.end_  = &row_data_[ row_ptr_[ridx+1] ] - 1;
+           return it;
+       }
    public:
+       /*! \return whether column access is enabled */
+       inline bool HaveColAccess( void ) const{
+           return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
+       }
+       /*! \brief get number of columns */
+       inline size_t NumCol( void ) const{
+           utils::Assert( this->HaveColAccess(), "NumCol: require column access" );
+           return col_ptr_.size() - 1;
+       }
+       /*! \brief get col iterator*/
+       inline ColIter GetSortedCol( size_t cidx ) const{
+           utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
+           ColIter it;
+           it.dptr_ = &col_data_[ col_ptr_[cidx] ] - 1;
+           it.end_  = &col_data_[ col_ptr_[cidx+1] ] - 1;
+           return it;
+       }
+       /*!
+        * \brief initialize the data so that we have both column and row major
+        *        access, call this whenever we need column access
+        */
+       inline void InitData( void ){
+           utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ );
+           builder.InitBudget( 0 );
+           for( size_t i = 0; i < this->NumRow(); i ++ ){
+               for( RowIter it = this->GetRow(i); it.Next(); ){
+                   builder.AddBudget( it.findex() );
+               }
+           }
+           builder.InitStorage();
+           for( size_t i = 0; i < this->NumRow(); i ++ ){
+               for( RowIter it = this->GetRow(i); it.Next(); ){
+                   builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) );
+               }
+           }
+           // sort columns
+           unsigned ncol = static_cast<unsigned>( this->NumCol() );
+           for( unsigned i = 0; i < ncol; i ++ ){
+               std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue );
+           }
+       }
        /*!
         * \brief save data to binary stream
-        *  note: since we have size_t in row_ptr,
+        *  note: since we have size_t in ptr,
         *        the function is not consistent between 64bit and 32bit machine
         * \param fo output stream
         */
-       inline void SaveBinary(utils::IStream &fo ) const{
-           size_t nrow = this->NumRow();
-           fo.Write( &nrow, sizeof(size_t) );
-           fo.Write( &row_ptr_[0], row_ptr_.size() * sizeof(size_t) );
-           if( row_data_.size() != 0 ){
-               fo.Write( &row_data_[0] , row_data_.size() * sizeof(REntry) );
+       inline void SaveBinary( utils::IStream &fo ) const{
+           FMatrixS::SaveBinary( fo, row_ptr_, row_data_ );
+           int col_access = this->HaveColAccess() ? 1 : 0;
+           fo.Write( &col_access, sizeof(int) );
+           if( col_access != 0 ){
+               FMatrixS::SaveBinary( fo, col_ptr_, col_data_ );
            }
        }
        /*!
         * \brief load data from binary stream
-        *  note: since we have size_t in row_ptr,
-        *        the function is not consistent between 64bit and 32bit machine
-        * \param fi output stream
+        *  note: since we have size_t in ptr,
+        *        the function is not consistent between 64bit and 32bit machine
+        * \param fi input stream
         */
        inline void LoadBinary( utils::IStream &fi ){
+           FMatrixS::LoadBinary( fi, row_ptr_, row_data_ );
+           int col_access;
+           fi.Read( &col_access, sizeof(int) );
+           if( col_access != 0 ){
+               FMatrixS::LoadBinary( fi, col_ptr_, col_data_ );
+           }
+       }
+   private:
+       /*!
+        * \brief save data to binary stream
+        * \param fo output stream
+        * \param ptr pointer data
+        * \param data data content
+        */
+       inline static void SaveBinary( utils::IStream &fo,
+                                      const std::vector<size_t> &ptr,
+                                      const std::vector<REntry> &data ){
+           size_t nrow = ptr.size() - 1;
+           fo.Write( &nrow, sizeof(size_t) );
+           fo.Write( &ptr[0], ptr.size() * sizeof(size_t) );
+           if( data.size() != 0 ){
+               fo.Write( &data[0] , data.size() * sizeof(REntry) );
+           }
+       }
+       /*!
+        * \brief load data from binary stream
+        * \param fi input stream
+        * \param ptr pointer data
+        * \param data data content
+        */
+       inline static void LoadBinary( utils::IStream &fi,
+                                      std::vector<size_t> &ptr,
+                                      std::vector<REntry> &data ){
            size_t nrow;
            utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" );
-           row_ptr_.resize( nrow + 1 );
-           utils::Assert( fi.Read( &row_ptr_[0], row_ptr_.size() * sizeof(size_t) ), "Load FMatrixS" );
+           ptr.resize( nrow + 1 );
+           utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
 
-           row_data_.resize( row_ptr_.back() );
-           if( row_data_.size() != 0 ){
-               utils::Assert( fi.Read( &row_data_[0] , row_data_.size() * sizeof(REntry) ) , "Load FMatrixS" );
+           data.resize( ptr.back() );
+           if( data.size() != 0 ){
+               utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
            }
        }
    private:
@@ -243,6 +322,10 @@
        std::vector<size_t> row_ptr_;
        /*! \brief data in the row */
        std::vector<REntry> row_data_;
+       /*! \brief column pointer of CSC format */
+       std::vector<size_t> col_ptr_;
+       /*! \brief column datas */
+       std::vector<REntry> col_data_;
    };
 };
};
diff --git a/regression/xgboost_regdata.h b/regression/xgboost_regdata.h
index 0a86c51f5..56c1bef47 100644
--- a/regression/xgboost_regdata.h
+++ b/regression/xgboost_regdata.h
@@ -65,11 +65,12 @@ namespace xgboost{
                labels.push_back( label );
                data.AddRow( findex, fvalue );
            }
-           this->UpdateInfo();
+           // initialize column support as well
+           data.InitData();
            if( !silent ){
                printf("%ux%u matrix with %lu entries is loaded from %s\n",
-                      (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                      (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
            }
            fclose(file);
        }
@@ -87,10 +88,12 @@
            labels.resize( data.NumRow() );
            utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
            fs.Close();
-           this->UpdateInfo();
+           // initialize column support as well
+           data.InitData();
+
            if( !silent ){
                printf("%ux%u matrix with %lu entries is loaded from %s\n",
-                      (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                      (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
            }
            return true;
        }
@@ -100,13 +103,16 @@
         * \param silent whether print information or not
         */
        inline void SaveBinary( const char* fname, bool silent = false ){
+           // initialize column support as well
+           data.InitData();
+
            utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
            data.SaveBinary( fs );
            fs.Write( &labels[0], sizeof(float) * data.NumRow() );
            fs.Close();
            if( !silent ){
                printf("%ux%u matrix with %lu entries is saved to %s\n",
-                      (unsigned)labels.size(), num_feature, (unsigned long)data.NumEntry(), fname );
+                      (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
            }
        }
        /*!
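The sketch below (not part of the patch) illustrates the column access protocol that the FMatrixS and regdata changes above introduce: rows are added first, InitData() builds the sorted CSC mirror, and GetSortedCol() then walks one feature column in ascending feature-value order. The main() driver and the concrete values are hypothetical; only AddRow/InitData/NumCol/GetSortedCol/ColIter come from the patch.

    // usage sketch for the new FMatrixS column access (hypothetical driver)
    #include <cstdio>
    #include <vector>
    #include "booster/xgboost_data.h"

    int main( void ){
        using namespace xgboost::booster;
        FMatrixS mat; mat.Clear();
        std::vector<bst_uint>  findex;
        std::vector<bst_float> fvalue;
        // row 0: feature 0 = 1.0, feature 2 = 0.5
        findex.push_back( 0 ); fvalue.push_back( 1.0f );
        findex.push_back( 2 ); fvalue.push_back( 0.5f );
        mat.AddRow( findex, fvalue );
        // row 1: feature 2 = 2.0
        findex.clear(); fvalue.clear();
        findex.push_back( 2 ); fvalue.push_back( 2.0f );
        mat.AddRow( findex, fvalue );
        // build the CSC mirror; required before NumCol/GetSortedCol
        mat.InitData();
        // walk column 2 sorted by feature value: (row 0, 0.5) then (row 1, 2.0)
        for( FMatrixS::ColIter it = mat.GetSortedCol( 2 ); it.Next(); ){
            printf( "row=%u fvalue=%g\n", it.rindex(), it.fvalue() );
        }
        return 0;
    }

This is the same protocol the templated LinearBooster::UpdateWeights and RTreeUpdater rely on: they only need NumCol()/GetSortedCol() or GetRow(), so any matrix type exposing these members can be plugged into the templates.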
diff --git a/utils/xgboost_matrix_csr.h b/utils/xgboost_matrix_csr.h
index 58c54f77b..b000095da 100644
--- a/utils/xgboost_matrix_csr.h
+++ b/utils/xgboost_matrix_csr.h
@@ -43,13 +43,13 @@ namespace xgboost{
        }
    public:
        /*!
-        * \brief step 1: initialize the number of rows in the data
-        * \nrows number of rows in the matrix
+        * \brief step 1: initialize the number of rows in the data, not necessarily exact
+        * \param nrows number of rows in the matrix, can be smaller than expected
         */
-       inline void InitBudget( size_t nrows ){
+       inline void InitBudget( size_t nrows = 0 ){
            if( !UseAcList ){
-               rptr.resize( nrows + 1 );
-               std::fill( rptr.begin(), rptr.end(), 0 );
+               rptr.clear();
+               rptr.resize( nrows + 1, 0 );
            }else{
                Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" );
                this->Cleanup();
@@ -61,6 +61,9 @@
         * \param nelem number of element budget add to this row
         */
        inline void AddBudget( size_t row_id, size_t nelem = 1 ){
+           if( rptr.size() < row_id + 2 ){
+               rptr.resize( row_id + 2, 0 );
+           }
            if( UseAcList ){
                if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id );
            }
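For reference, a minimal sketch (not part of the patch) of the two-phase SparseCSRMBuilder protocol that the hunk above relaxes: with the new auto-growing AddBudget, InitBudget(0) may start empty and rows can be announced lazily. The unsigned element type and the BuildExample wrapper are made up for illustration; the builder calls themselves appear in the patch.

    // two-phase CSR construction: count per-row budgets, then fill storage
    #include <vector>
    #include "utils/xgboost_matrix_csr.h"

    void BuildExample( void ){
        std::vector<size_t>   rptr;  // row pointers: rptr[i]..rptr[i+1] spans row i
        std::vector<unsigned> data;  // element storage
        xgboost::utils::SparseCSRMBuilder<unsigned> builder( rptr, data );
        // phase 1: declare how many elements each row will receive;
        // rptr grows on demand, so the row count need not be known up front
        builder.InitBudget( 0 );
        builder.AddBudget( 3 );        // row 3: one element
        builder.AddBudget( 1, 2 );     // row 1: two elements
        // phase 2: allocate storage, then push the actual elements
        builder.InitStorage();
        builder.PushElem( 3, 42u );
        builder.PushElem( 1, 7u );
        builder.PushElem( 1, 8u );
        // rptr/data now form a CSR layout with empty rows 0 and 2
    }

FMatrixS::InitData() above is exactly this pattern, run twice over GetRow iterators with REntry elements and followed by a per-column sort on feature value.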