chg license, README
This commit is contained in:
parent
fffad41e53
commit
752f336cb3
28
LICENSE
28
LICENSE
@ -1,21 +1,13 @@
|
|||||||
The MIT License (MIT)
|
|
||||||
|
|
||||||
Copyright (c) 2014 Tianqi Chen
|
Copyright (c) 2014 Tianqi Chen
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
you may not use this file except in compliance with the License.
|
||||||
in the Software without restriction, including without limitation the rights
|
You may obtain a copy of the License at
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
Unless required by applicable law or agreed to in writing, software
|
||||||
copies or substantial portions of the Software.
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
See the License for the specific language governing permissions and
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
limitations under the License.
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
|
|||||||
23
README.md
23
README.md
@ -1,18 +1,17 @@
|
|||||||
xgboost: A Gradient Boosting Library
|
xgboost: eXtreme Gradient Boosting Library
|
||||||
=======
|
=======
|
||||||
Creator: Tianqi Chen: tianqi.tchen AT gmail
|
Creator: Tianqi Chen
|
||||||
|
|
||||||
General Purpose Gradient Boosting Library
|
Features
|
||||||
|
=======
|
||||||
Goal: A stand-alone efficient library to do learning via boosting in functional space
|
* Sparse feature format:
|
||||||
|
- Sparse feature format allows easy handling of missing values and improves computational efficiency.
|
||||||
Features:
|
* Push the limit on single machine:
|
||||||
* Sparse feature format, handling of missing features. This allows efficient categorical feature encoding as indicators. The speed of booster only depends on number of existing features.
|
- Efficient implementation that optimizes memory and computation.
|
||||||
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
|
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
|
||||||
|
|
||||||
|
Planned key components
|
||||||
Planned key components:
|
=======
|
||||||
|
|
||||||
* Gradient boosting models:
|
* Gradient boosting models:
|
||||||
- regression tree (GBRT)
|
- regression tree (GBRT)
|
||||||
- linear model/lasso
|
- linear model/lasso
|
||||||
@ -22,7 +21,7 @@ Planned key components:
|
|||||||
- ranking
|
- ranking
|
||||||
- matrix factorization
|
- matrix factorization
|
||||||
- structured prediction
|
- structured prediction
|
||||||
(3) OpenMP implementation(optional)
|
(3) OpenMP implementation
|
||||||
|
|
||||||
File extension convention:
|
File extension convention:
|
||||||
(1) .h are interface, utils and data structures, with detailed comment;
|
(1) .h are interface, utils and data structures, with detailed comment;
|
||||||
|
|||||||
@ -65,20 +65,59 @@ namespace xgboost{
|
|||||||
double sum_grad;
|
double sum_grad;
|
||||||
/*! \brief sum hessian statistics */
|
/*! \brief sum hessian statistics */
|
||||||
double sum_hess;
|
double sum_hess;
|
||||||
|
/*! \brief last feature value scanned */
|
||||||
|
float last_fvalue;
|
||||||
/*! \brief current best solution */
|
/*! \brief current best solution */
|
||||||
SplitEntry best;
|
SplitEntry best;
|
||||||
|
/*! \brief constructor */
|
||||||
ThreadEntry( void ){
|
ThreadEntry( void ){
|
||||||
sum_grad = sum_hess = 0;
|
sum_grad = sum_hess = 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
|
inline void CleanSTemp( const std::vector<int> &qexpand ){
|
||||||
|
for( size_t i = 0; i < stemp.size(); ++ i ){
|
||||||
|
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||||
|
ThreadEntry &e = stemp[i][ qexpand[j] ];
|
||||||
|
e.sum_grad = e.sum_hess = 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// make leaf nodes for all qexpand, update node statistics, mark leaf value
|
||||||
|
inline void UpdateSNode( const std::vector<int> &qexpand ){
|
||||||
|
this->CleanSTemp( qexpand );
|
||||||
|
// step 1: find sum statistics
|
||||||
|
const unsigned ndata = static_cast<unsigned>( position.size() );
|
||||||
|
#pragma omp parallel for schedule( static )
|
||||||
|
for( unsigned i = 0; i < ndata; ++ i ){
|
||||||
|
const int tid = omp_get_thread_num();
|
||||||
|
if( position[i] < 0 ) continue;
|
||||||
|
stemp[tid][ position[i] ].sum_grad += grad[i];
|
||||||
|
stemp[tid][ position[i] ].sum_hess += hess[i];
|
||||||
|
}
|
||||||
|
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||||
|
double sum_grad = 0.0, sum_hess = 0.0;
|
||||||
|
for( size_t tid = 0; tid < stemp.size(); tid ++ ){
|
||||||
|
sum_grad += stemp[tid][j].sum_grad;
|
||||||
|
sum_hess += stemp[tid][j].sum_hess;
|
||||||
|
}
|
||||||
|
if( !tree[j].is_root() ){
|
||||||
|
const float pweight = snode[ tree[j].parent() ].weight;
|
||||||
|
snode[j].weight = param.CalcWeight( sum_grad, sum_hess, pweight );
|
||||||
|
}else{
|
||||||
|
snode[j].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
|
||||||
|
snode[j].loss_gain = param.CalcGain( sum_grad, sum_hess, 0.0f );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
// find split at current level
|
// find split at current level
|
||||||
inline void FindSplit( int depth ){
|
inline void FindSplit( int depth ){
|
||||||
unsigned nsize = static_cast<unsigned>(feat_index.size());
|
unsigned nsize = static_cast<unsigned>(feat_index.size());
|
||||||
|
|
||||||
#pragma omp parallel for schedule( dynamic, 1 )
|
#pragma omp parallel for schedule( dynamic, 1 )
|
||||||
for( unsigned i = 0; i < nsize; ++ i ){
|
for( unsigned i = 0; i < nsize; ++ i ){
|
||||||
const unsigned fid = feat_index[i];
|
const unsigned fid = feat_index[i];
|
||||||
|
const int tid = omp_get_thread_num();
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// initialize temp data structure
|
// initialize temp data structure
|
||||||
@ -93,7 +132,8 @@ namespace xgboost{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
{// initialize feature index
|
{// initialize feature index
|
||||||
for( int i = 0; i < tree.param.num_feature; i ++ ){
|
int ncol = static_cast<int>( smat.NumCol() );
|
||||||
|
for( int i = 0; i < ncol; i ++ ){
|
||||||
if( smat.GetSortedCol(i).Next() ){
|
if( smat.GetSortedCol(i).Next() ){
|
||||||
feat_index.push_back( i );
|
feat_index.push_back( i );
|
||||||
}
|
}
|
||||||
@ -116,9 +156,18 @@ namespace xgboost{
|
|||||||
{// setup statistics space for each tree node
|
{// setup statistics space for each tree node
|
||||||
snode.resize( tree.param.num_roots, SplitEntry() );
|
snode.resize( tree.param.num_roots, SplitEntry() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{// expand query
|
||||||
|
qexpand.reserve( 256 ); qexpand.clear();
|
||||||
|
for( int i = 0; i < tree.param.num_roots; ++ i ){
|
||||||
|
qexpand.push_back( i );
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
// local helper tmp data structure
|
// local helper tmp data structure
|
||||||
|
// queue of nodes to be expanded
|
||||||
|
std::vector<int> qexpand;
|
||||||
// Per feature: shuffle index of each feature index
|
// Per feature: shuffle index of each feature index
|
||||||
std::vector<int> feat_index;
|
std::vector<int> feat_index;
|
||||||
// Instance Data: current node position in the tree of each instance
|
// Instance Data: current node position in the tree of each instance
|
||||||
|
|||||||
@ -68,65 +68,65 @@ namespace xgboost{
|
|||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
// pointer to parent, highest bit is used to indicate whether it's a left child or not
|
// pointer to parent, highest bit is used to indicate whether it's a left child or not
|
||||||
int sparent;
|
int parent_;
|
||||||
// pointer to left, right
|
// pointer to left, right
|
||||||
int left, right;
|
int cleft_, cright_;
|
||||||
// split feature index, left split or right split depends on the highest bit
|
// split feature index, left split or right split depends on the highest bit
|
||||||
unsigned sindex;
|
unsigned sindex_;
|
||||||
// extra info
|
// extra info
|
||||||
Info info;
|
Info info_;
|
||||||
private:
|
private:
|
||||||
inline void set_parent( int pidx, bool is_left_child = true ){
|
inline void set_parent( int pidx, bool is_left_child = true ){
|
||||||
if( is_left_child ) pidx |= (1U << 31);
|
if( is_left_child ) pidx |= (1U << 31);
|
||||||
this->sparent = pidx;
|
this->parent_ = pidx;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
/*! \brief index of left child */
|
/*! \brief index of left child */
|
||||||
inline int cleft( void ) const{
|
inline int cleft( void ) const{
|
||||||
return this->left;
|
return this->cleft_;
|
||||||
}
|
}
|
||||||
/*! \brief index of right child */
|
/*! \brief index of right child */
|
||||||
inline int cright( void ) const{
|
inline int cright( void ) const{
|
||||||
return this->right;
|
return this->cright_;
|
||||||
}
|
}
|
||||||
/*! \brief feature index of split condition */
|
/*! \brief feature index of split condition */
|
||||||
inline unsigned split_index( void ) const{
|
inline unsigned split_index( void ) const{
|
||||||
return sindex & ( (1U<<31) - 1U );
|
return sindex_ & ( (1U<<31) - 1U );
|
||||||
}
|
}
|
||||||
/*! \brief when feature is unknown, whether goes to left child */
|
/*! \brief when feature is unknown, whether goes to left child */
|
||||||
inline bool default_left( void ) const{
|
inline bool default_left( void ) const{
|
||||||
return (sindex >> 31) != 0;
|
return (sindex_ >> 31) != 0;
|
||||||
}
|
}
|
||||||
/*! \brief whether current node is leaf node */
|
/*! \brief whether current node is leaf node */
|
||||||
inline bool is_leaf( void ) const{
|
inline bool is_leaf( void ) const{
|
||||||
return left == -1;
|
return cleft_ == -1;
|
||||||
}
|
}
|
||||||
/*! \brief get leaf value of leaf node */
|
/*! \brief get leaf value of leaf node */
|
||||||
inline float leaf_value( void ) const{
|
inline float leaf_value( void ) const{
|
||||||
return (this->info).leaf_value;
|
return (this->info_).leaf_value;
|
||||||
}
|
}
|
||||||
/*! \brief get split condition of the node */
|
/*! \brief get split condition of the node */
|
||||||
inline TSplitCond split_cond( void ) const{
|
inline TSplitCond split_cond( void ) const{
|
||||||
return (this->info).split_cond;
|
return (this->info_).split_cond;
|
||||||
}
|
}
|
||||||
/*! \brief get parent of the node */
|
/*! \brief get parent of the node */
|
||||||
inline int parent( void ) const{
|
inline int parent( void ) const{
|
||||||
return sparent & ( (1U << 31) - 1 );
|
return parent_ & ( (1U << 31) - 1 );
|
||||||
}
|
}
|
||||||
/*! \brief whether current node is left child */
|
/*! \brief whether current node is left child */
|
||||||
inline bool is_left_child( void ) const{
|
inline bool is_left_child( void ) const{
|
||||||
return ( sparent & (1U << 31)) != 0;
|
return ( parent_ & (1U << 31)) != 0;
|
||||||
}
|
}
|
||||||
/*! \brief whether current node is root */
|
/*! \brief whether current node is root */
|
||||||
inline bool is_root( void ) const{
|
inline bool is_root( void ) const{
|
||||||
return sparent == -1;
|
return parent_ == -1;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief set the right child
|
* \brief set the right child
|
||||||
* \param nid node id of the right child
|
* \param nid node id of the right child
|
||||||
*/
|
*/
|
||||||
inline void set_right_child( int nid ){
|
inline void set_right_child( int nid ){
|
||||||
this->right = nid;
|
this->cright_ = nid;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief set split condition of current node
|
* \brief set split condition of current node
|
||||||
@ -136,8 +136,8 @@ namespace xgboost{
|
|||||||
*/
|
*/
|
||||||
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
|
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
|
||||||
if( default_left ) split_index |= (1U << 31);
|
if( default_left ) split_index |= (1U << 31);
|
||||||
this->sindex = split_index;
|
this->sindex_ = split_index;
|
||||||
(this->info).split_cond = split_cond;
|
(this->info_).split_cond = split_cond;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief set the leaf value of the node
|
* \brief set the leaf value of the node
|
||||||
@ -146,9 +146,9 @@ namespace xgboost{
|
|||||||
* additional information
|
* additional information
|
||||||
*/
|
*/
|
||||||
inline void set_leaf( float value, int right = -1 ){
|
inline void set_leaf( float value, int right = -1 ){
|
||||||
(this->info).leaf_value = value;
|
(this->info_).leaf_value = value;
|
||||||
this->left = -1;
|
this->cleft_ = -1;
|
||||||
this->right = right;
|
this->cright_ = right;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
protected:
|
protected:
|
||||||
@ -187,10 +187,10 @@ namespace xgboost{
|
|||||||
* \param new leaf value
|
* \param new leaf value
|
||||||
*/
|
*/
|
||||||
inline void ChangeToLeaf( int rid, float value ){
|
inline void ChangeToLeaf( int rid, float value ){
|
||||||
utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child");
|
utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child");
|
||||||
utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child");
|
utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child");
|
||||||
this->DeleteNode( nodes[ rid ].left );
|
this->DeleteNode( nodes[ rid ].cleft() );
|
||||||
this->DeleteNode( nodes[ rid ].right );
|
this->DeleteNode( nodes[ rid ].cright() );
|
||||||
nodes[ rid ].set_leaf( value );
|
nodes[ rid ].set_leaf( value );
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
@ -253,10 +253,10 @@ namespace xgboost{
|
|||||||
inline void AddChilds( int nid ){
|
inline void AddChilds( int nid ){
|
||||||
int pleft = this->AllocNode();
|
int pleft = this->AllocNode();
|
||||||
int pright = this->AllocNode();
|
int pright = this->AllocNode();
|
||||||
nodes[ nid ].left = pleft;
|
nodes[ nid ].cleft_ = pleft;
|
||||||
nodes[ nid ].right = pright;
|
nodes[ nid ].cright_ = pright;
|
||||||
nodes[ nodes[ nid ].left ].set_parent( nid, true );
|
nodes[ nodes[ nid ].cleft() ].set_parent( nid, true );
|
||||||
nodes[ nodes[ nid ].right ].set_parent( nid, false );
|
nodes[ nodes[ nid ].cright() ].set_parent( nid, false );
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief only add a right child to a leaf node
|
* \brief only add a right child to a leaf node
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user