chg license, README
This commit is contained in:
parent
b57656902e
commit
e4a4f7d315
28
LICENSE
28
LICENSE
@ -1,21 +1,13 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2014 Tianqi Chen
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
23
README.md
23
README.md
@ -1,18 +1,17 @@
|
||||
xgboost: A Gradient Boosting Library
|
||||
xgboost: eXtreme Gradient Boosting Library
|
||||
=======
|
||||
Creator: Tianqi Chen: tianqi.tchen AT gmail
|
||||
Creator: Tianqi Chen
|
||||
|
||||
General Purpose Gradient Boosting Library
|
||||
|
||||
Goal: A stand-alone efficient library to do learning via boosting in functional space
|
||||
|
||||
Features:
|
||||
* Sparse feature format, handling of missing features. This allows efficient categorical feature encoding as indicators. The speed of booster only depends on number of existing features.
|
||||
Features
|
||||
=======
|
||||
* Sparse feature format:
|
||||
- Sparse feature format allows easy handling of missing values, and improves computation efficiency.
|
||||
* Push the limit on single machine:
|
||||
- Efficient implementation that optimizes memory and computation.
|
||||
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
|
||||
|
||||
|
||||
Planned key components:
|
||||
|
||||
Planned key components
|
||||
=======
|
||||
* Gradient boosting models:
|
||||
- regression tree (GBRT)
|
||||
- linear model/lasso
|
||||
@ -22,7 +21,7 @@ Planned key components:
|
||||
- ranking
|
||||
- matrix factorization
|
||||
- structured prediction
|
||||
(3) OpenMP implementation(optional)
|
||||
(3) OpenMP implementation
|
||||
|
||||
File extension convention:
|
||||
(1) .h files are interfaces, utils and data structures, with detailed comments;
|
||||
|
||||
@ -65,20 +65,59 @@ namespace xgboost{
|
||||
double sum_grad;
|
||||
/*! \brief sum hessian statistics */
|
||||
double sum_hess;
|
||||
/*! \brief last feature value scanned */
|
||||
float last_fvalue;
|
||||
/*! \brief current best solution */
|
||||
SplitEntry best;
|
||||
/*! \brief constructor */
|
||||
ThreadEntry( void ){
|
||||
sum_grad = sum_hess = 0;
|
||||
}
|
||||
};
|
||||
private:
|
||||
inline void CleanSTemp( const std::vector<int> &qexpand ){
|
||||
for( size_t i = 0; i < stemp.size(); ++ i ){
|
||||
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||
ThreadEntry &e = stemp[i][ qexpand[j] ];
|
||||
e.sum_grad = e.sum_hess = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
// make leaf nodes for all qexpand, update node statistics, mark leaf value
|
||||
inline void UpdateSNode( const std::vector<int> &qexpand ){
|
||||
this->CleanSTemp( qexpand );
|
||||
// step 1: find sum statistics
|
||||
const unsigned ndata = static_cast<unsigned>( position.size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
const int tid = omp_get_thread_num();
|
||||
if( position[i] < 0 ) continue;
|
||||
stemp[tid][ position[i] ].sum_grad += grad[i];
|
||||
stemp[tid][ position[i] ].sum_hess += hess[i];
|
||||
}
|
||||
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for( size_t tid = 0; tid < stemp.size(); tid ++ ){
|
||||
sum_grad += stemp[tid][j].sum_grad;
|
||||
sum_hess += stemp[tid][j].sum_hess;
|
||||
}
|
||||
if( !tree[j].is_root() ){
|
||||
const float pweight = snode[ tree[j].parent() ].weight;
|
||||
snode[j].weight = param.CalcWeight( sum_grad, sum_hess, pweight );
|
||||
}else{
|
||||
snode[j].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
|
||||
snode[j].loss_gain = param.CalcGain( sum_grad, sum_hess, 0.0f );
|
||||
}
|
||||
}
|
||||
}
|
||||
// find split at current level
|
||||
inline void FindSplit( int depth ){
|
||||
unsigned nsize = static_cast<unsigned>(feat_index.size());
|
||||
|
||||
#pragma omp parallel for schedule( dynamic, 1 )
|
||||
for( unsigned i = 0; i < nsize; ++ i ){
|
||||
const unsigned fid = feat_index[i];
|
||||
const unsigned fid = feat_index[i];
|
||||
const int tid = omp_get_thread_num();
|
||||
|
||||
}
|
||||
}
|
||||
// initialize temp data structure
|
||||
@ -93,7 +132,8 @@ namespace xgboost{
|
||||
}
|
||||
}
|
||||
{// initialize feature index
|
||||
for( int i = 0; i < tree.param.num_feature; i ++ ){
|
||||
int ncol = static_cast<int>( smat.NumCol() );
|
||||
for( int i = 0; i < ncol; i ++ ){
|
||||
if( smat.GetSortedCol(i).Next() ){
|
||||
feat_index.push_back( i );
|
||||
}
|
||||
@ -116,9 +156,18 @@ namespace xgboost{
|
||||
{// setup statistics space for each tree node
|
||||
snode.resize( tree.param.num_roots, SplitEntry() );
|
||||
}
|
||||
|
||||
{// expand query
|
||||
qexpand.reserve( 256 ); qexpand.clear();
|
||||
for( int i = 0; i < tree.param.num_roots; ++ i ){
|
||||
qexpand.push_back( i );
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
// local helper tmp data structure
|
||||
// queue of nodes to be expanded
|
||||
std::vector<int> qexpand;
|
||||
// Per feature: shuffle index of each feature index
|
||||
std::vector<int> feat_index;
|
||||
// Instance Data: current node position in the tree of each instance
|
||||
|
||||
@ -68,65 +68,65 @@ namespace xgboost{
|
||||
};
|
||||
private:
|
||||
// pointer to parent, highest bit is used to indicate whether it's a left child or not
|
||||
int sparent;
|
||||
int parent_;
|
||||
// pointer to left, right
|
||||
int left, right;
|
||||
int cleft_, cright_;
|
||||
// split feature index, left split or right split depends on the highest bit
|
||||
unsigned sindex;
|
||||
unsigned sindex_;
|
||||
// extra info
|
||||
Info info;
|
||||
Info info_;
|
||||
private:
|
||||
inline void set_parent( int pidx, bool is_left_child = true ){
|
||||
if( is_left_child ) pidx |= (1U << 31);
|
||||
this->sparent = pidx;
|
||||
this->parent_ = pidx;
|
||||
}
|
||||
public:
|
||||
/*! \brief index of left child */
|
||||
inline int cleft( void ) const{
|
||||
return this->left;
|
||||
return this->cleft_;
|
||||
}
|
||||
/*! \brief index of right child */
|
||||
inline int cright( void ) const{
|
||||
return this->right;
|
||||
return this->cright_;
|
||||
}
|
||||
/*! \brief feature index of split condition */
|
||||
inline unsigned split_index( void ) const{
|
||||
return sindex & ( (1U<<31) - 1U );
|
||||
return sindex_ & ( (1U<<31) - 1U );
|
||||
}
|
||||
/*! \brief when feature is unknown, whether goes to left child */
|
||||
inline bool default_left( void ) const{
|
||||
return (sindex >> 31) != 0;
|
||||
return (sindex_ >> 31) != 0;
|
||||
}
|
||||
/*! \brief whether current node is leaf node */
|
||||
inline bool is_leaf( void ) const{
|
||||
return left == -1;
|
||||
return cleft_ == -1;
|
||||
}
|
||||
/*! \brief get leaf value of leaf node */
|
||||
inline float leaf_value( void ) const{
|
||||
return (this->info).leaf_value;
|
||||
return (this->info_).leaf_value;
|
||||
}
|
||||
/*! \brief get split condition of the node */
|
||||
inline TSplitCond split_cond( void ) const{
|
||||
return (this->info).split_cond;
|
||||
return (this->info_).split_cond;
|
||||
}
|
||||
/*! \brief get parent of the node */
|
||||
inline int parent( void ) const{
|
||||
return sparent & ( (1U << 31) - 1 );
|
||||
return parent_ & ( (1U << 31) - 1 );
|
||||
}
|
||||
/*! \brief whether current node is left child */
|
||||
inline bool is_left_child( void ) const{
|
||||
return ( sparent & (1U << 31)) != 0;
|
||||
return ( parent_ & (1U << 31)) != 0;
|
||||
}
|
||||
/*! \brief whether current node is root */
|
||||
inline bool is_root( void ) const{
|
||||
return sparent == -1;
|
||||
return parent_ == -1;
|
||||
}
|
||||
/*!
|
||||
* \brief set the right child
|
||||
* \param nid node id of the right child
|
||||
*/
|
||||
inline void set_right_child( int nid ){
|
||||
this->right = nid;
|
||||
this->cright_ = nid;
|
||||
}
|
||||
/*!
|
||||
* \brief set split condition of current node
|
||||
@ -136,8 +136,8 @@ namespace xgboost{
|
||||
*/
|
||||
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
|
||||
if( default_left ) split_index |= (1U << 31);
|
||||
this->sindex = split_index;
|
||||
(this->info).split_cond = split_cond;
|
||||
this->sindex_ = split_index;
|
||||
(this->info_).split_cond = split_cond;
|
||||
}
|
||||
/*!
|
||||
* \brief set the leaf value of the node
|
||||
@ -146,9 +146,9 @@ namespace xgboost{
|
||||
* additional information
|
||||
*/
|
||||
inline void set_leaf( float value, int right = -1 ){
|
||||
(this->info).leaf_value = value;
|
||||
this->left = -1;
|
||||
this->right = right;
|
||||
(this->info_).leaf_value = value;
|
||||
this->cleft_ = -1;
|
||||
this->cright_ = right;
|
||||
}
|
||||
};
|
||||
protected:
|
||||
@ -187,10 +187,10 @@ namespace xgboost{
|
||||
* \param new leaf value
|
||||
*/
|
||||
inline void ChangeToLeaf( int rid, float value ){
|
||||
utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child");
|
||||
utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child");
|
||||
this->DeleteNode( nodes[ rid ].left );
|
||||
this->DeleteNode( nodes[ rid ].right );
|
||||
utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child");
|
||||
utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child");
|
||||
this->DeleteNode( nodes[ rid ].cleft() );
|
||||
this->DeleteNode( nodes[ rid ].cright() );
|
||||
nodes[ rid ].set_leaf( value );
|
||||
}
|
||||
public:
|
||||
@ -253,10 +253,10 @@ namespace xgboost{
|
||||
inline void AddChilds( int nid ){
|
||||
int pleft = this->AllocNode();
|
||||
int pright = this->AllocNode();
|
||||
nodes[ nid ].left = pleft;
|
||||
nodes[ nid ].right = pright;
|
||||
nodes[ nodes[ nid ].left ].set_parent( nid, true );
|
||||
nodes[ nodes[ nid ].right ].set_parent( nid, false );
|
||||
nodes[ nid ].cleft_ = pleft;
|
||||
nodes[ nid ].cright_ = pright;
|
||||
nodes[ nodes[ nid ].cleft() ].set_parent( nid, true );
|
||||
nodes[ nodes[ nid ].cright() ].set_parent( nid, false );
|
||||
}
|
||||
/*!
|
||||
* \brief only add a right child to a leaf node
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user