chg license, README

This commit is contained in:
tqchen 2014-02-28 20:09:40 -08:00
parent b57656902e
commit e4a4f7d315
4 changed files with 102 additions and 62 deletions

28
LICENSE
View File

@ -1,21 +1,13 @@
The MIT License (MIT)
Copyright (c) 2014 Tianqi Chen
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -1,18 +1,17 @@
xgboost: A Gradient Boosting Library
xgboost: eXtreme Gradient Boosting Library
=======
Creator: Tianqi Chen: tianqi.tchen AT gmail
Creator: Tianqi Chen
General Purpose Gradient Boosting Library
Goal: A stand-alone efficient library to do learning via boosting in functional space
Features:
* Sparse feature format, handling of missing features. This allows efficient categorical feature encoding as indicators. The speed of booster only depends on number of existing features.
Features
=======
* Sparse feature format:
- Sparse feature format allows easy handling of missing values, and improves computation efficiency.
* Push the limit on single machine:
- Efficient implementation that optimizes memory and computation.
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
Planned key components:
Planned key components
=======
* Gradient boosting models:
- regression tree (GBRT)
- linear model/lasso
@ -22,7 +21,7 @@ Planned key components:
- ranking
- matrix factorization
- structured prediction
(3) OpenMP implementation(optional)
(3) OpenMP implementation
File extension convention:
(1) .h files are interfaces, utilities and data structures, with detailed comments;

View File

@ -65,20 +65,59 @@ namespace xgboost{
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief last feature value scanned */
float last_fvalue;
/*! \brief current best solution */
SplitEntry best;
/*! \brief constructor */
ThreadEntry( void ){
sum_grad = sum_hess = 0;
}
};
private:
inline void CleanSTemp( const std::vector<int> &qexpand ){
for( size_t i = 0; i < stemp.size(); ++ i ){
for( size_t j = 0; j < qexpand.size(); ++ j ){
ThreadEntry &e = stemp[i][ qexpand[j] ];
e.sum_grad = e.sum_hess = 0.0f;
}
}
}
// make leaf nodes for all qexpand, update node statistics, mark leaf value
/*!
 * \brief sum gradient statistics for the nodes in qexpand and refresh their snode entries
 *
 *  Each thread accumulates into its own row of stemp, indexed by the node an
 *  instance currently sits in; the rows are then reduced per node to compute
 *  the node weight (and, for root nodes, the loss gain).
 * \param qexpand ids of the nodes to be updated
 */
inline void UpdateSNode( const std::vector<int> &qexpand ){
    this->CleanSTemp( qexpand );
    // step 1: per-thread partial sums, keyed by node id
    const unsigned ndata = static_cast<unsigned>( position.size() );
    #pragma omp parallel for schedule( static )
    for( unsigned i = 0; i < ndata; ++ i ){
        const int tid = omp_get_thread_num();
        // instances with a negative position are skipped
        if( position[i] < 0 ) continue;
        stemp[tid][ position[i] ].sum_grad += grad[i];
        stemp[tid][ position[i] ].sum_hess += hess[i];
    }
    // step 2: reduce over threads and update node statistics.
    // stemp, tree and snode are all keyed by node id, so the queue slot j
    // has to be translated through qexpand before indexing them.
    for( size_t j = 0; j < qexpand.size(); ++ j ){
        const int nid = qexpand[j];
        double sum_grad = 0.0, sum_hess = 0.0;
        for( size_t tid = 0; tid < stemp.size(); tid ++ ){
            sum_grad += stemp[tid][nid].sum_grad;
            sum_hess += stemp[tid][nid].sum_hess;
        }
        if( !tree[nid].is_root() ){
            // non-root: the parent's weight is passed on to CalcWeight
            const float pweight = snode[ tree[nid].parent() ].weight;
            snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, pweight );
        }else{
            snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
            snode[nid].loss_gain = param.CalcGain( sum_grad, sum_hess, 0.0f );
        }
    }
}
// find split at current level
/*!
 * \brief scan the features listed in feat_index in parallel
 *
 *  The loop body currently only fetches the feature id and the calling
 *  thread id; no split search is performed yet.
 * \param depth not used by the current body
 */
inline void FindSplit( int depth ){
    unsigned nsize = static_cast<unsigned>(feat_index.size());
    #pragma omp parallel for schedule( dynamic, 1 )
    for( unsigned i = 0; i < nsize; ++ i ){
        // declared once: a second identical declaration in the same scope is a redefinition error
        const unsigned fid = feat_index[i];
        const int tid = omp_get_thread_num();
    }
}
// initialize temp data structure
@ -93,7 +132,8 @@ namespace xgboost{
}
}
{// initialize feature index
for( int i = 0; i < tree.param.num_feature; i ++ ){
int ncol = static_cast<int>( smat.NumCol() );
for( int i = 0; i < ncol; i ++ ){
if( smat.GetSortedCol(i).Next() ){
feat_index.push_back( i );
}
@ -116,9 +156,18 @@ namespace xgboost{
{// setup statistics space for each tree node
snode.resize( tree.param.num_roots, SplitEntry() );
}
{// expand query
qexpand.reserve( 256 ); qexpand.clear();
for( int i = 0; i < tree.param.num_roots; ++ i ){
qexpand.push_back( i );
}
}
}
private:
// local helper tmp data structure
// queue of nodes to be expanded
std::vector<int> qexpand;
// Per feature: shuffle index of each feature index
std::vector<int> feat_index;
// Instance Data: current node position in the tree of each instance

View File

@ -68,65 +68,65 @@ namespace xgboost{
};
private:
// pointer to parent, highest bit is used to indicate whether it's a left child or not
int sparent;
int parent_;
// pointer to left, right
int left, right;
int cleft_, cright_;
// split feature index, left split or right split depends on the highest bit
unsigned sindex;
unsigned sindex_;
// extra info
Info info;
Info info_;
private:
inline void set_parent( int pidx, bool is_left_child = true ){
if( is_left_child ) pidx |= (1U << 31);
this->sparent = pidx;
this->parent_ = pidx;
}
public:
/*! \brief index of left child */
inline int cleft( void ) const{
return this->left;
return this->cleft_;
}
/*! \brief index of right child */
inline int cright( void ) const{
return this->right;
return this->cright_;
}
/*! \brief feature index of split condition */
inline unsigned split_index( void ) const{
return sindex & ( (1U<<31) - 1U );
return sindex_ & ( (1U<<31) - 1U );
}
/*! \brief when feature is unknown, whether goes to left child */
inline bool default_left( void ) const{
return (sindex >> 31) != 0;
return (sindex_ >> 31) != 0;
}
/*! \brief whether current node is leaf node */
inline bool is_leaf( void ) const{
return left == -1;
return cleft_ == -1;
}
/*! \brief get leaf value of leaf node */
inline float leaf_value( void ) const{
return (this->info).leaf_value;
return (this->info_).leaf_value;
}
/*! \brief get split condition of the node */
inline TSplitCond split_cond( void ) const{
return (this->info).split_cond;
return (this->info_).split_cond;
}
/*! \brief get parent of the node */
inline int parent( void ) const{
return sparent & ( (1U << 31) - 1 );
return parent_ & ( (1U << 31) - 1 );
}
/*! \brief whether current node is left child */
inline bool is_left_child( void ) const{
return ( sparent & (1U << 31)) != 0;
return ( parent_ & (1U << 31)) != 0;
}
/*! \brief whether current node is root */
inline bool is_root( void ) const{
return sparent == -1;
return parent_ == -1;
}
/*!
* \brief set the right child
* \param nid node id to right child
*/
inline void set_right_child( int nid ){
this->right = nid;
this->cright_ = nid;
}
/*!
* \brief set split condition of current node
@ -136,8 +136,8 @@ namespace xgboost{
*/
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
if( default_left ) split_index |= (1U << 31);
this->sindex = split_index;
(this->info).split_cond = split_cond;
this->sindex_ = split_index;
(this->info_).split_cond = split_cond;
}
/*!
* \brief set the leaf value of the node
@ -146,9 +146,9 @@ namespace xgboost{
* additional information
*/
inline void set_leaf( float value, int right = -1 ){
(this->info).leaf_value = value;
this->left = -1;
this->right = right;
(this->info_).leaf_value = value;
this->cleft_ = -1;
this->cright_ = right;
}
};
protected:
@ -187,10 +187,10 @@ namespace xgboost{
* \param value new leaf value
*/
inline void ChangeToLeaf( int rid, float value ){
utils::Assert( nodes[ nodes[rid].left ].is_leaf(), "can not delete a non termial child");
utils::Assert( nodes[ nodes[rid].right ].is_leaf(), "can not delete a non termial child");
this->DeleteNode( nodes[ rid ].left );
this->DeleteNode( nodes[ rid ].right );
utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child");
utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child");
this->DeleteNode( nodes[ rid ].cleft() );
this->DeleteNode( nodes[ rid ].cright() );
nodes[ rid ].set_leaf( value );
}
public:
@ -253,10 +253,10 @@ namespace xgboost{
inline void AddChilds( int nid ){
int pleft = this->AllocNode();
int pright = this->AllocNode();
nodes[ nid ].left = pleft;
nodes[ nid ].right = pright;
nodes[ nodes[ nid ].left ].set_parent( nid, true );
nodes[ nodes[ nid ].right ].set_parent( nid, false );
nodes[ nid ].cleft_ = pleft;
nodes[ nid ].cright_ = pright;
nodes[ nodes[ nid ].cleft() ].set_parent( nid, true );
nodes[ nodes[ nid ].cright() ].set_parent( nid, false );
}
/*!
* \brief only add a right child to a leaf node