From 752f336cb394ed97dd038ed4f5f273b96db00f48 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 28 Feb 2014 20:09:40 -0800 Subject: [PATCH] chg license, README --- LICENSE | 28 +++++-------- README.md | 23 +++++----- booster/tree/xgboost_col_treemaker.hpp | 55 ++++++++++++++++++++++-- booster/tree/xgboost_tree_model.h | 58 +++++++++++++------------- 4 files changed, 102 insertions(+), 62 deletions(-) diff --git a/LICENSE b/LICENSE index fd2b0bef2..2d9ea05e4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,13 @@ -The MIT License (MIT) - Copyright (c) 2014 Tianqi Chen -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md index 32ceb2706..f321609e3 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,17 @@ -xgboost: A Gradient Boosting Library +xgboost: eXtreme Gradient Boosting Library ======= -Creater: Tianqi Chen: tianqi.tchen AT gmail +Creater: Tianqi Chen -General Purpose Gradient Boosting Library - -Goal: A stand-alone efficient library to do learning via boosting in functional space - -Features: -* Sparse feature format, handling of missing features. This allows efficient categorical feature encoding as indicators. The speed of booster only depends on number of existing features. +Features +======= +* Sparse feature format: + - Sparse feature format allows easy handling of missing values, and improve computation efficiency. +* Push the limit on single machine: + - Efficient implementation that optimizes memory and computation. * Layout of gradient boosting algorithm to support generic tasks, see project wiki. - -Planned key components: - +Planned key components +======= * Gradient boosting models: - regression tree (GBRT) - linear model/lasso @@ -22,7 +21,7 @@ Planned key components: - ranking - matrix factorization - structured prediction -(3) OpenMP implementation(optional) +(3) OpenMP implementation File extension convention: (1) .h are interface, utils and data structures, with detailed comment; diff --git a/booster/tree/xgboost_col_treemaker.hpp b/booster/tree/xgboost_col_treemaker.hpp index d5591d8df..3b1258e21 100644 --- a/booster/tree/xgboost_col_treemaker.hpp +++ b/booster/tree/xgboost_col_treemaker.hpp @@ -65,20 +65,59 @@ namespace xgboost{ double sum_grad; /*! \brief sum hessian statistics */ double sum_hess; + /*! \brief last feature value scanned */ + float last_fvalue; /*! \brief current best solution */ SplitEntry best; + /*! \brief constructor */ ThreadEntry( void ){ sum_grad = sum_hess = 0; } }; private: + inline void CleanSTemp( const std::vector &qexpand ){ + for( size_t i = 0; i < stemp.size(); ++ i ){ + for( size_t j = 0; j < qexpand.size(); ++ j ){ + ThreadEntry &e = stemp[i][ qexpand[j] ]; + e.sum_grad = e.sum_hess = 0.0f; + } + } + } + // make leaf nodes for all qexpand, update node statistics, mark leaf value + inline void UpdateSNode( const std::vector &qexpand ){ + this->CleanSTemp( qexpand ); + // step 1: find sum statistics + const unsigned ndata = static_cast( position.size() ); + #pragma omp parallel for schedule( static ) + for( unsigned i = 0; i < ndata; ++ i ){ + const int tid = omp_get_thread_num(); + if( position[i] < 0 ) continue; + stemp[tid][ position[i] ].sum_grad += grad[i]; + stemp[tid][ position[i] ].sum_hess += hess[i]; + } + for( size_t j = 0; j < qexpand.size(); ++ j ){ + double sum_grad = 0.0, sum_hess = 0.0; + for( size_t tid = 0; tid < stemp.size(); tid ++ ){ + sum_grad += stemp[tid][j].sum_grad; + sum_hess += stemp[tid][j].sum_hess; + } + if( !tree[j].is_root() ){ + const float pweight = snode[ tree[j].parent() ].weight; + snode[j].weight = param.CalcWeight( sum_grad, sum_hess, pweight ); + }else{ + snode[j].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f ); + snode[j].loss_gain = param.CalcGain( sum_grad, sum_hess, 0.0f ); + } + } + } // find split at current level inline void FindSplit( int depth ){ unsigned nsize = static_cast(feat_index.size()); - #pragma omp parallel for schedule( dynamic, 1 ) for( unsigned i = 0; i < nsize; ++ i ){ - const unsigned fid = feat_index[i]; + const unsigned fid = feat_index[i]; + const int tid = omp_get_thread_num(); + } } // initialize temp data structure @@ -93,7 +132,8 @@ namespace xgboost{ } } {// initialize feature index - for( int i = 0; i < tree.param.num_feature; i ++ ){ + int ncol = static_cast( smat.NumCol() ); + for( int i = 0; i < ncol; i ++ ){ if( smat.GetSortedCol(i).Next() ){ feat_index.push_back( i ); } @@ -116,9 +156,18 @@ namespace xgboost{ {// setup statistics space for each tree node snode.resize( tree.param.num_roots, SplitEntry() ); } + + {// expand query + qexpand.reserve( 256 ); qexpand.clear(); + for( int i = 0; i < tree.param.num_roots; ++ i ){ + qexpand.push_back( i ); + } + } } private: // local helper tmp data structure + // queue of nodes to be expanded + std::vector qexpand; // Per feature: shuffle index of each feature index std::vector feat_index; // Instance Data: current node position in the tree of each instance diff --git a/booster/tree/xgboost_tree_model.h b/booster/tree/xgboost_tree_model.h index 31501f215..d8dd8e03a 100644 --- a/booster/tree/xgboost_tree_model.h +++ b/booster/tree/xgboost_tree_model.h @@ -68,65 +68,65 @@ namespace xgboost{ }; private: // pointer to parent, highest bit is used to indicate whether it's a left child or not - int sparent; + int parent_; // pointer to left, right - int left, right; + int cleft_, cright_; // split feature index, left split or right split depends on the highest bit - unsigned sindex; + unsigned sindex_; // extra info - Info info; + Info info_; private: inline void set_parent( int pidx, bool is_left_child = true ){ if( is_left_child ) pidx |= (1U << 31); - this->sparent = pidx; + this->parent_ = pidx; } public: /*! \brief index of left child */ inline int cleft( void ) const{ - return this->left; + return this->cleft_; } /*! \brief index of right child */ inline int cright( void ) const{ - return this->right; + return this->cright_; } /*! \brief feature index of split condition */ inline unsigned split_index( void ) const{ - return sindex & ( (1U<<31) - 1U ); + return sindex_ & ( (1U<<31) - 1U ); } /*! \brief when feature is unknown, whether goes to left child */ inline bool default_left( void ) const{ - return (sindex >> 31) != 0; + return (sindex_ >> 31) != 0; } /*! \brief whether current node is leaf node */ inline bool is_leaf( void ) const{ - return left == -1; + return cleft_ == -1; } /*! \brief get leaf value of leaf node */ inline float leaf_value( void ) const{ - return (this->info).leaf_value; + return (this->info_).leaf_value; } /*! \brief get split condition of the node */ inline TSplitCond split_cond( void ) const{ - return (this->info).split_cond; + return (this->info_).split_cond; } /*! \brief get parent of the node */ inline int parent( void ) const{ - return sparent & ( (1U << 31) - 1 ); + return parent_ & ( (1U << 31) - 1 ); } /*! \brief whether current node is left child */ inline bool is_left_child( void ) const{ - return ( sparent & (1U << 31)) != 0; + return ( parent_ & (1U << 31)) != 0; } /*! \brief whether current node is root */ inline bool is_root( void ) const{ - return sparent == -1; + return parent_ == -1; } /*! * \brief set the right child * \param nide node id to right child */ inline void set_right_child( int nid ){ - this->right = nid; + this->cright_ = nid; } /*! * \brief set split condition of current node @@ -136,8 +136,8 @@ namespace xgboost{ */ inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){ if( default_left ) split_index |= (1U << 31); - this->sindex = split_index; - (this->info).split_cond = split_cond; + this->sindex_ = split_index; + (this->info_).split_cond = split_cond; } /*! * \brief set the leaf value of the node @@ -146,9 +146,9 @@ namespace xgboost{ * additional information */ inline void set_leaf( float value, int right = -1 ){ - (this->info).leaf_value = value; - this->left = -1; - this->right = right; + (this->info_).leaf_value = value; + this->cleft_ = -1; + this->cright_ = right; } }; protected: @@ -187,10 +187,10 @@ namespace xgboost{ * \param new leaf value */ inline void ChangeToLeaf( int rid, float value ){ - utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child"); - utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child"); - this->DeleteNode( nodes[ rid ].left ); - this->DeleteNode( nodes[ rid ].right ); + utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child"); + utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child"); + this->DeleteNode( nodes[ rid ].cleft() ); + this->DeleteNode( nodes[ rid ].cright() ); nodes[ rid ].set_leaf( value ); } public: @@ -253,10 +253,10 @@ namespace xgboost{ inline void AddChilds( int nid ){ int pleft = this->AllocNode(); int pright = this->AllocNode(); - nodes[ nid ].left = pleft; - nodes[ nid ].right = pright; - nodes[ nodes[ nid ].left ].set_parent( nid, true ); - nodes[ nodes[ nid ].right ].set_parent( nid, false ); + nodes[ nid ].cleft_ = pleft; + nodes[ nid ].cright_ = pright; + nodes[ nodes[ nid ].cleft() ].set_parent( nid, true ); + nodes[ nodes[ nid ].cright() ].set_parent( nid, false ); } /*! * \brief only add a right child to a leaf node