Compare commits
110 Commits
create-pul
...
v0.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2aa1031d24 | ||
|
|
1440dc9c8f | ||
|
|
27bd5496a8 | ||
|
|
81b32525e0 | ||
|
|
6fa0948461 | ||
|
|
61123f86aa | ||
|
|
110b97fea2 | ||
|
|
b2eb4e956b | ||
|
|
56ae0e32e3 | ||
|
|
e350c38483 | ||
|
|
e59ed018e6 | ||
|
|
3123d11655 | ||
|
|
ca74cba9ec | ||
|
|
a84d4f3e68 | ||
|
|
76cd1561a0 | ||
|
|
5b4f77488c | ||
|
|
b0676fc682 | ||
|
|
97418b113e | ||
|
|
d56394d2ef | ||
|
|
6a91438634 | ||
|
|
da3b3c8136 | ||
|
|
fcf06a7164 | ||
|
|
8f9efa2725 | ||
|
|
6e48a938c6 | ||
|
|
19b28b978d | ||
|
|
8f16ef8e75 | ||
|
|
d2377b26bd | ||
|
|
70f3f31206 | ||
|
|
f62c5dc3c1 | ||
|
|
2d67377a96 | ||
|
|
d982be9dca | ||
|
|
98114cabce | ||
|
|
2910bdedf4 | ||
|
|
128e94be1a | ||
|
|
eade6ddf7c | ||
|
|
9b45210fa7 | ||
|
|
ddd61b43be | ||
|
|
98e851d80f | ||
|
|
3d223232e3 | ||
|
|
b689b4525a | ||
|
|
a3ca03cfc1 | ||
|
|
2aa1978cb6 | ||
|
|
e3b7abfb47 | ||
|
|
2adf905dcf | ||
|
|
cfbeeef9c1 | ||
|
|
8ae1d37828 | ||
|
|
0fc64d1c2a | ||
|
|
1eca127f69 | ||
|
|
c7b29774c2 | ||
|
|
a8f69878eb | ||
|
|
51b6d86c17 | ||
|
|
082a57ba0b | ||
|
|
f3c98d0c4b | ||
|
|
1748e4517a | ||
|
|
328e41244c | ||
|
|
155b593984 | ||
|
|
76cbc754c9 | ||
|
|
97ca3bf739 | ||
|
|
752f336cb3 | ||
|
|
fffad41e53 | ||
|
|
10382f6365 | ||
|
|
7b2fe1bf5d | ||
|
|
88c982012a | ||
|
|
b6f98bf37a | ||
|
|
3a4d0f28d9 | ||
|
|
e58daa6d52 | ||
|
|
a5b37e0395 | ||
|
|
e75488b578 | ||
|
|
1160a38323 | ||
|
|
4401d549f1 | ||
|
|
fd120a8f5c | ||
|
|
00add6dd1d | ||
|
|
cd009f2541 | ||
|
|
582be45810 | ||
|
|
3c93216850 | ||
|
|
787f76e952 | ||
|
|
91c170e463 | ||
|
|
0c44347e82 | ||
|
|
603704287d | ||
|
|
c933625f94 | ||
|
|
cebf39ea47 | ||
|
|
f22139c659 | ||
|
|
06ce8c9f3a | ||
|
|
98a60b3610 | ||
|
|
2dc6c9c683 | ||
|
|
3e53fcf465 | ||
|
|
cb0fa75252 | ||
|
|
51a63d80d0 | ||
|
|
1e356c5bd2 | ||
|
|
c5ada79be5 | ||
|
|
dd924becd8 | ||
|
|
7fa301a8ce | ||
|
|
3d1e0badd3 | ||
|
|
7e605306ad | ||
|
|
5e5acdc121 | ||
|
|
7302a4e1b5 | ||
|
|
21dd4b5904 | ||
|
|
61e5410789 | ||
|
|
0febb1a443 | ||
|
|
36a04f17df | ||
|
|
3dd477c4b2 | ||
|
|
779d6a34de | ||
|
|
4535ab7e5c | ||
|
|
75c36a0667 | ||
|
|
790c76e814 | ||
|
|
a81ea03022 | ||
|
|
a198759df6 | ||
|
|
a607444038 | ||
|
|
ee6a0c7f4a | ||
|
|
57fef8bc54 |
6
.gitignore
vendored
6
.gitignore
vendored
@@ -11,3 +11,9 @@
|
||||
*.lai
|
||||
*.la
|
||||
*.a
|
||||
*~
|
||||
*txt*
|
||||
*conf
|
||||
*buffer
|
||||
*model
|
||||
xgboost
|
||||
211
LICENSE
211
LICENSE
@@ -1,202 +1,13 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
Copyright (c) 2014 Tianqi Chen
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright {yyyy} {name of copyright owner}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
25
Makefile
Normal file
25
Makefile
Normal file
@@ -0,0 +1,25 @@
|
||||
export CC = gcc
|
||||
export CXX = g++
|
||||
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
|
||||
|
||||
# specify tensor path
|
||||
BIN = xgboost
|
||||
OBJ =
|
||||
.PHONY: clean all
|
||||
|
||||
all: $(BIN) $(OBJ)
|
||||
export LDFLAGS= -pthread -lm
|
||||
|
||||
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
|
||||
|
||||
$(BIN) :
|
||||
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||
|
||||
$(OBJ) :
|
||||
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
|
||||
|
||||
install:
|
||||
cp -f -r $(BIN) $(INSTALL_PATH)
|
||||
|
||||
clean:
|
||||
$(RM) $(OBJ) $(BIN) *~
|
||||
40
README.md
40
README.md
@@ -1,4 +1,40 @@
|
||||
xgboost
|
||||
xgboost: eXtreme Gradient Boosting
|
||||
=======
|
||||
A General purpose gradient boosting (tree) library.
|
||||
|
||||
General Purpose Gradient Boosting Library
|
||||
Authors:
|
||||
* Tianqi Chen, project creater
|
||||
* Kailong Chen, contributes regression module
|
||||
|
||||
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
|
||||
|
||||
Features
|
||||
=======
|
||||
* Sparse feature format:
|
||||
- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
|
||||
* Push the limit on single machine:
|
||||
- Efficient implementation that optimizes memory and computation.
|
||||
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
|
||||
|
||||
Supported key components
|
||||
=======
|
||||
* Gradient boosting models:
|
||||
- regression tree (GBRT)
|
||||
- linear model/lasso
|
||||
* Objectives to support tasks:
|
||||
- regression
|
||||
- classification
|
||||
* OpenMP implementation
|
||||
|
||||
Planned components
|
||||
=======
|
||||
* More objective to support tasks:
|
||||
- ranking
|
||||
- matrix factorization
|
||||
- structured prediction
|
||||
|
||||
File extension convention
|
||||
=======
|
||||
* .h are interface, utils and data structures, with detailed comment;
|
||||
* .cpp are implementations that will be compiled, with less comment;
|
||||
* .hpp are implementations that will be included by .cpp, with less comment
|
||||
|
||||
200
booster/linear/xgboost_linear.hpp
Normal file
200
booster/linear/xgboost_linear.hpp
Normal file
@@ -0,0 +1,200 @@
|
||||
#ifndef XGBOOST_LINEAR_HPP
|
||||
#define XGBOOST_LINEAR_HPP
|
||||
/*!
|
||||
* \file xgboost_linear.h
|
||||
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
|
||||
* the update rule is coordinate descent, require column major format
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "../xgboost.h"
|
||||
#include "../../utils/xgboost_utils.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*! \brief linear model, with L1/L2 regularization */
|
||||
template<typename FMatrix>
|
||||
class LinearBooster : public InterfaceBooster<FMatrix>{
|
||||
public:
|
||||
LinearBooster( void ){ silent = 0;}
|
||||
virtual ~LinearBooster( void ){}
|
||||
public:
|
||||
virtual void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp( name, "silent") ) silent = atoi( val );
|
||||
if( model.weight.size() == 0 ) model.param.SetParam( name, val );
|
||||
param.SetParam( name, val );
|
||||
}
|
||||
virtual void LoadModel( utils::IStream &fi ){
|
||||
model.LoadModel( fi );
|
||||
}
|
||||
virtual void SaveModel( utils::IStream &fo ) const{
|
||||
model.SaveModel( fo );
|
||||
}
|
||||
virtual void InitModel( void ){
|
||||
model.InitModel();
|
||||
}
|
||||
public:
|
||||
virtual void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index ){
|
||||
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
|
||||
this->UpdateWeights( grad, hess, fmat );
|
||||
}
|
||||
inline float Predict( const FMatrix &fmat, bst_uint ridx, unsigned root_index ){
|
||||
float sum = model.bias();
|
||||
for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
|
||||
sum += model.weight[ it.findex() ] * it.fvalue();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
virtual float Predict( const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned rid = 0 ){
|
||||
float sum = model.bias();
|
||||
for( size_t i = 0; i < feat.size(); i ++ ){
|
||||
if( funknown[i] ) continue;
|
||||
sum += model.weight[ i ] * feat[ i ];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected:
|
||||
// training parameter
|
||||
struct ParamTrain{
|
||||
/*! \brief learning_rate */
|
||||
float learning_rate;
|
||||
/*! \brief regularization weight for L2 norm */
|
||||
float reg_lambda;
|
||||
/*! \brief regularization weight for L1 norm */
|
||||
float reg_alpha;
|
||||
/*! \brief regularization weight for L2 norm in bias */
|
||||
float reg_lambda_bias;
|
||||
|
||||
ParamTrain( void ){
|
||||
reg_alpha = 0.0f; reg_lambda = 0.0f; reg_lambda_bias = 0.0f;
|
||||
learning_rate = 1.0f;
|
||||
}
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
// sync-names
|
||||
if( !strcmp( "eta", name ) ) learning_rate = (float)atof( val );
|
||||
if( !strcmp( "lambda", name ) ) reg_lambda = (float)atof( val );
|
||||
if( !strcmp( "alpha", name ) ) reg_alpha = (float)atof( val );
|
||||
if( !strcmp( "lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
|
||||
// real names
|
||||
if( !strcmp( "learning_rate", name ) ) learning_rate = (float)atof( val );
|
||||
if( !strcmp( "reg_lambda", name ) ) reg_lambda = (float)atof( val );
|
||||
if( !strcmp( "reg_alpha", name ) ) reg_alpha = (float)atof( val );
|
||||
if( !strcmp( "reg_lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
|
||||
}
|
||||
// given original weight calculate delta
|
||||
inline double CalcDelta( double sum_grad, double sum_hess, double w ){
|
||||
if( sum_hess < 1e-5f ) return 0.0f;
|
||||
double tmp = w - ( sum_grad + reg_lambda*w )/( sum_hess + reg_lambda );
|
||||
if ( tmp >=0 ){
|
||||
return std::max(-( sum_grad + reg_lambda*w + reg_alpha)/(sum_hess+reg_lambda),-w);
|
||||
}else{
|
||||
return std::min(-( sum_grad + reg_lambda*w - reg_alpha)/(sum_hess+reg_lambda),-w);
|
||||
}
|
||||
}
|
||||
// given original weight calculate delta bias
|
||||
inline double CalcDeltaBias( double sum_grad, double sum_hess, double w ){
|
||||
return - (sum_grad + reg_lambda_bias*w) / (sum_hess + reg_lambda_bias );
|
||||
}
|
||||
};
|
||||
|
||||
// model for linear booster
|
||||
class Model{
|
||||
public:
|
||||
// model parameter
|
||||
struct Param{
|
||||
// number of feature dimension
|
||||
int num_feature;
|
||||
// reserved field
|
||||
int reserved[ 32 ];
|
||||
// constructor
|
||||
Param( void ){
|
||||
num_feature = 0;
|
||||
memset( reserved, 0, sizeof(reserved) );
|
||||
}
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp( name, "num_feature" ) ) num_feature = atoi( val );
|
||||
}
|
||||
};
|
||||
public:
|
||||
Param param;
|
||||
// weight for each of feature, bias is the last one
|
||||
std::vector<float> weight;
|
||||
public:
|
||||
// initialize the model parameter
|
||||
inline void InitModel( void ){
|
||||
// bias is the last weight
|
||||
weight.resize( param.num_feature + 1 );
|
||||
std::fill( weight.begin(), weight.end(), 0.0f );
|
||||
}
|
||||
// save the model to file
|
||||
inline void SaveModel( utils::IStream &fo ) const{
|
||||
fo.Write( ¶m, sizeof(Param) );
|
||||
fo.Write( &weight[0], sizeof(float) * weight.size() );
|
||||
}
|
||||
// load model from file
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
utils::Assert( fi.Read( ¶m, sizeof(Param) ) != 0, "Load LinearBooster" );
|
||||
weight.resize( param.num_feature + 1 );
|
||||
utils::Assert( fi.Read( &weight[0], sizeof(float) * weight.size() ) != 0, "Load LinearBooster" );
|
||||
}
|
||||
// model bias
|
||||
inline float &bias( void ){
|
||||
return weight.back();
|
||||
}
|
||||
};
|
||||
private:
|
||||
int silent;
|
||||
protected:
|
||||
Model model;
|
||||
ParamTrain param;
|
||||
protected:
|
||||
// update weights, should work for any FMatrix
|
||||
inline void UpdateWeights( std::vector<float> &grad,
|
||||
const std::vector<float> &hess,
|
||||
const FMatrix &smat ){
|
||||
{// optimize bias
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for( size_t i = 0; i < grad.size(); i ++ ){
|
||||
sum_grad += grad[ i ]; sum_hess += hess[ i ];
|
||||
}
|
||||
// remove bias effect
|
||||
double dw = param.learning_rate * param.CalcDeltaBias( sum_grad, sum_hess, model.bias() );
|
||||
model.bias() += dw;
|
||||
// update grad value
|
||||
for( size_t i = 0; i < grad.size(); i ++ ){
|
||||
grad[ i ] += dw * hess[ i ];
|
||||
}
|
||||
}
|
||||
|
||||
// optimize weight
|
||||
const unsigned nfeat= (unsigned)smat.NumCol();
|
||||
for( unsigned i = 0; i < nfeat; i ++ ){
|
||||
if( !smat.GetSortedCol( i ).Next() ) continue;
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
|
||||
const float v = it.fvalue();
|
||||
sum_grad += grad[ it.rindex() ] * v;
|
||||
sum_hess += hess[ it.rindex() ] * v * v;
|
||||
}
|
||||
float w = model.weight[ i ];
|
||||
double dw = param.learning_rate * param.CalcDelta( sum_grad, sum_hess, w );
|
||||
model.weight[ i ] += dw;
|
||||
// update grad value
|
||||
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
|
||||
const float v = it.fvalue();
|
||||
grad[ it.rindex() ] += hess[ it.rindex() ] * v * dw;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
147
booster/tree/xgboost_base_treemaker.hpp
Normal file
147
booster/tree/xgboost_base_treemaker.hpp
Normal file
@@ -0,0 +1,147 @@
|
||||
#ifndef XGBOOST_BASE_TREEMAKER_HPP
|
||||
#define XGBOOST_BASE_TREEMAKER_HPP
|
||||
/*!
|
||||
* \file xgboost_base_treemaker.hpp
|
||||
* \brief implementation of base data structure for regression tree maker,
|
||||
* gives common operations of tree construction steps template
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <vector>
|
||||
#include "xgboost_tree_model.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
class BaseTreeMaker{
|
||||
protected:
|
||||
BaseTreeMaker( RegTree &tree,
|
||||
const TreeParamTrain ¶m )
|
||||
: tree( tree ), param( param ){}
|
||||
protected:
|
||||
// statistics that is helpful to decide a split
|
||||
struct SplitEntry{
|
||||
/*! \brief loss change after split this node */
|
||||
float loss_chg;
|
||||
/*! \brief split index */
|
||||
unsigned sindex;
|
||||
/*! \brief split value */
|
||||
float split_value;
|
||||
/*! \brief constructor */
|
||||
SplitEntry( void ){
|
||||
loss_chg = 0.0f;
|
||||
split_value = 0.0f; sindex = 0;
|
||||
}
|
||||
// This function gives better priority to lower index when loss_chg equals
|
||||
// not the best way, but helps to give consistent result during multi-thread execution
|
||||
inline bool NeedReplace( float loss_chg, unsigned split_index ) const{
|
||||
if( this->split_index() <= split_index ){
|
||||
return loss_chg > this->loss_chg;
|
||||
}else{
|
||||
return !(this->loss_chg > loss_chg);
|
||||
}
|
||||
}
|
||||
inline bool Update( const SplitEntry &e ){
|
||||
if( this->NeedReplace( e.loss_chg, e.split_index() ) ){
|
||||
this->loss_chg = e.loss_chg;
|
||||
this->sindex = e.sindex;
|
||||
this->split_value = e.split_value;
|
||||
return true;
|
||||
} else{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
inline bool Update( float loss_chg, unsigned split_index, float split_value, bool default_left ){
|
||||
if( this->NeedReplace( loss_chg, split_index ) ){
|
||||
this->loss_chg = loss_chg;
|
||||
if( default_left ) split_index |= (1U << 31);
|
||||
this->sindex = split_index;
|
||||
this->split_value = split_value;
|
||||
return true;
|
||||
}else{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
inline unsigned split_index( void ) const{
|
||||
return sindex & ( (1U<<31) - 1U );
|
||||
}
|
||||
inline bool default_left( void ) const{
|
||||
return (sindex >> 31) != 0;
|
||||
}
|
||||
};
|
||||
struct NodeEntry{
|
||||
/*! \brief sum gradient statistics */
|
||||
double sum_grad;
|
||||
/*! \brief sum hessian statistics */
|
||||
double sum_hess;
|
||||
/*! \brief loss of this node, without split */
|
||||
float root_gain;
|
||||
/*! \brief weight calculated related to current data */
|
||||
float weight;
|
||||
/*! \brief current best solution */
|
||||
SplitEntry best;
|
||||
NodeEntry( void ){
|
||||
sum_grad = sum_hess = 0.0;
|
||||
weight = root_gain = 0.0f;
|
||||
}
|
||||
};
|
||||
private:
|
||||
// try to prune off current leaf, return true if successful
|
||||
inline void TryPruneLeaf( int nid, int depth ){
|
||||
if( tree[ nid ].is_root() ) return;
|
||||
int pid = tree[ nid ].parent();
|
||||
RegTree::NodeStat &s = tree.stat( pid );
|
||||
++ s.leaf_child_cnt;
|
||||
|
||||
if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
|
||||
this->stat_num_pruned += 2;
|
||||
// need to be pruned
|
||||
tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight );
|
||||
// tail recursion
|
||||
this->TryPruneLeaf( pid, depth - 1 );
|
||||
}
|
||||
}
|
||||
protected:
|
||||
/*! \brief do prunning of a tree */
|
||||
inline int DoPrune( void ){
|
||||
this->stat_num_pruned = 0;
|
||||
// initialize auxiliary statistics
|
||||
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
|
||||
tree.stat( nid ).leaf_child_cnt = 0;
|
||||
tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
|
||||
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
|
||||
}
|
||||
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
|
||||
if( tree[ nid ].is_leaf() ) this->TryPruneLeaf( nid, tree.GetDepth(nid) );
|
||||
}
|
||||
return this->stat_num_pruned;
|
||||
}
|
||||
protected:
|
||||
/*! \brief update queue expand add in new leaves */
|
||||
inline void UpdateQueueExpand( std::vector<int> &qexpand ){
|
||||
std::vector<int> newnodes;
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
if( !tree[ nid ].is_leaf() ){
|
||||
newnodes.push_back( tree[nid].cleft() );
|
||||
newnodes.push_back( tree[nid].cright() );
|
||||
}
|
||||
}
|
||||
// use new nodes for qexpand
|
||||
qexpand = newnodes;
|
||||
}
|
||||
protected:
|
||||
// local helper tmp data structure
|
||||
// statistics
|
||||
int stat_num_pruned;
|
||||
/*! \brief queue of nodes to be expanded */
|
||||
std::vector<int> qexpand;
|
||||
/*! \brief TreeNode Data: statistics for each constructed node, the derived class must maintain this */
|
||||
std::vector<NodeEntry> snode;
|
||||
protected:
|
||||
// original data that supports tree construction
|
||||
RegTree &tree;
|
||||
const TreeParamTrain ¶m;
|
||||
};
|
||||
}; // namespace booster
|
||||
}; // namespace xgboost
|
||||
#endif // XGBOOST_BASE_TREEMAKER_HPP
|
||||
335
booster/tree/xgboost_col_treemaker.hpp
Normal file
335
booster/tree/xgboost_col_treemaker.hpp
Normal file
@@ -0,0 +1,335 @@
|
||||
#ifndef XGBOOST_COL_TREEMAKER_HPP
|
||||
#define XGBOOST_COL_TREEMAKER_HPP
|
||||
/*!
|
||||
* \file xgboost_col_treemaker.hpp
|
||||
* \brief implementation of regression tree maker,
|
||||
* use a column based approach, with OpenMP
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
// use openmp
|
||||
#include <vector>
|
||||
#include "xgboost_tree_model.h"
|
||||
#include "../../utils/xgboost_omp.h"
|
||||
#include "../../utils/xgboost_random.h"
|
||||
#include "../../utils/xgboost_fmap.h"
|
||||
#include "xgboost_base_treemaker.hpp"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
template<typename FMatrix>
|
||||
class ColTreeMaker : protected BaseTreeMaker{
|
||||
public:
|
||||
ColTreeMaker( RegTree &tree,
|
||||
const TreeParamTrain ¶m,
|
||||
const std::vector<float> &grad,
|
||||
const std::vector<float> &hess,
|
||||
const FMatrix &smat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
const utils::FeatConstrain &constrain )
|
||||
: BaseTreeMaker( tree, param ),
|
||||
grad(grad), hess(hess),
|
||||
smat(smat), root_index(root_index), constrain(constrain) {
|
||||
utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
|
||||
utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
|
||||
utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
|
||||
utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
|
||||
}
|
||||
inline void Make( int& stat_max_depth, int& stat_num_pruned ){
|
||||
this->InitData();
|
||||
this->InitNewNode( this->qexpand );
|
||||
stat_max_depth = 0;
|
||||
|
||||
for( int depth = 0; depth < param.max_depth; ++ depth ){
|
||||
this->FindSplit( depth );
|
||||
this->UpdateQueueExpand( this->qexpand );
|
||||
this->InitNewNode( this->qexpand );
|
||||
// if nothing left to be expand, break
|
||||
if( qexpand.size() == 0 ) break;
|
||||
stat_max_depth = depth + 1;
|
||||
}
|
||||
// set all the rest expanding nodes to leaf
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
|
||||
}
|
||||
// start prunning the tree
|
||||
stat_num_pruned = this->DoPrune();
|
||||
}
|
||||
private:
|
||||
/*! \brief per thread x per node entry to store tmp data */
|
||||
struct ThreadEntry{
|
||||
/*! \brief sum gradient statistics */
|
||||
double sum_grad;
|
||||
/*! \brief sum hessian statistics */
|
||||
double sum_hess;
|
||||
/*! \brief last feature value scanned */
|
||||
float last_fvalue;
|
||||
/*! \brief current best solution */
|
||||
SplitEntry best;
|
||||
/*! \brief constructor */
|
||||
ThreadEntry( void ){
|
||||
this->ClearStats();
|
||||
}
|
||||
/*! \brief clear statistics */
|
||||
inline void ClearStats( void ){
|
||||
sum_grad = sum_hess = 0.0;
|
||||
}
|
||||
};
|
||||
private:
|
||||
// make leaf nodes for all qexpand, update node statistics, mark leaf value
|
||||
inline void InitNewNode( const std::vector<int> &qexpand ){
|
||||
{// setup statistics space for each tree node
|
||||
for( size_t i = 0; i < stemp.size(); ++ i ){
|
||||
stemp[i].resize( tree.param.num_nodes, ThreadEntry() );
|
||||
}
|
||||
snode.resize( tree.param.num_nodes, NodeEntry() );
|
||||
}
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( position.size() );
|
||||
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
const int tid = omp_get_thread_num();
|
||||
if( position[i] < 0 ) continue;
|
||||
stemp[tid][ position[i] ].sum_grad += grad[i];
|
||||
stemp[tid][ position[i] ].sum_hess += hess[i];
|
||||
}
|
||||
|
||||
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||
const int nid = qexpand[ j ];
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for( size_t tid = 0; tid < stemp.size(); tid ++ ){
|
||||
sum_grad += stemp[tid][nid].sum_grad;
|
||||
sum_hess += stemp[tid][nid].sum_hess;
|
||||
}
|
||||
// update node statistics
|
||||
snode[nid].sum_grad = sum_grad;
|
||||
snode[nid].sum_hess = sum_hess;
|
||||
snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
|
||||
if( !tree[nid].is_root() ){
|
||||
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, tree.stat( tree[nid].parent() ).base_weight );
|
||||
tree.stat(nid).base_weight = snode[nid].weight;
|
||||
}else{
|
||||
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
|
||||
tree.stat(nid).base_weight = snode[nid].weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
// enumerate the split values of specific feature
|
||||
template<typename Iter>
|
||||
inline void EnumerateSplit( Iter it, const unsigned fid, std::vector<ThreadEntry> &temp, bool is_forward_search ){
|
||||
// clear all the temp statistics
|
||||
for( size_t j = 0; j < qexpand.size(); ++ j ){
|
||||
temp[ qexpand[j] ].ClearStats();
|
||||
}
|
||||
|
||||
while( it.Next() ){
|
||||
const bst_uint ridx = it.rindex();
|
||||
const int nid = position[ ridx ];
|
||||
if( nid < 0 ) continue;
|
||||
|
||||
const float fvalue = it.fvalue();
|
||||
ThreadEntry &e = temp[ nid ];
|
||||
|
||||
// test if first hit, this is fine, because we set 0 during init
|
||||
if( e.sum_hess == 0.0 ){
|
||||
e.sum_grad = grad[ ridx ];
|
||||
e.sum_hess = hess[ ridx ];
|
||||
e.last_fvalue = fvalue;
|
||||
}else{
|
||||
// try to find a split
|
||||
if( fabsf(fvalue - e.last_fvalue) > rt_2eps && e.sum_hess >= param.min_child_weight ){
|
||||
const double csum_hess = snode[ nid ].sum_hess - e.sum_hess;
|
||||
if( csum_hess >= param.min_child_weight ){
|
||||
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
|
||||
const double loss_chg =
|
||||
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
|
||||
+ param.CalcGain( csum_grad , csum_hess , snode[nid].weight )
|
||||
- snode[nid].root_gain;
|
||||
e.best.Update( loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search );
|
||||
}
|
||||
}
|
||||
// update the statistics
|
||||
e.sum_grad += grad[ ridx ];
|
||||
e.sum_hess += hess[ ridx ];
|
||||
e.last_fvalue = fvalue;
|
||||
}
|
||||
}
|
||||
// finish updating all statistics, check if it is possible to include all sum statistics
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[ i ];
|
||||
ThreadEntry &e = temp[ nid ];
|
||||
const double csum_hess = snode[nid].sum_hess - e.sum_hess;
|
||||
|
||||
if( e.sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
|
||||
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
|
||||
const double loss_chg =
|
||||
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
|
||||
+ param.CalcGain( csum_grad, csum_hess, snode[nid].weight )
|
||||
- snode[nid].root_gain;
|
||||
const float delta = is_forward_search ? rt_eps:-rt_eps;
|
||||
e.best.Update( loss_chg, fid, e.last_fvalue + delta, !is_forward_search );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find splits at current level
|
||||
inline void FindSplit( int depth ){
|
||||
const unsigned nsize = static_cast<unsigned>( feat_index.size() );
|
||||
|
||||
#pragma omp parallel for schedule( dynamic, 1 )
|
||||
for( unsigned i = 0; i < nsize; ++ i ){
|
||||
const unsigned fid = feat_index[i];
|
||||
const int tid = omp_get_thread_num();
|
||||
if( param.need_forward_search() ){
|
||||
this->EnumerateSplit( smat.GetSortedCol(fid), fid, stemp[tid], true );
|
||||
}
|
||||
if( param.need_backward_search() ){
|
||||
this->EnumerateSplit( smat.GetReverseSortedCol(fid), fid, stemp[tid], false );
|
||||
}
|
||||
}
|
||||
|
||||
// after this each thread's stemp will get the best candidates, aggregate results
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[ i ];
|
||||
NodeEntry &e = snode[ nid ];
|
||||
for( int tid = 0; tid < this->nthread; ++ tid ){
|
||||
e.best.Update( stemp[ tid ][ nid ].best );
|
||||
}
|
||||
|
||||
// now we know the solution in snode[ nid ], set split
|
||||
if( e.best.loss_chg > rt_eps ){
|
||||
tree.AddChilds( nid );
|
||||
tree[ nid ].set_split( e.best.split_index(), e.best.split_value, e.best.default_left() );
|
||||
} else{
|
||||
tree[ nid ].set_leaf( e.weight * param.learning_rate );
|
||||
}
|
||||
}
|
||||
|
||||
{// reset position
|
||||
// step 1, set default direct nodes to default, and leaf nodes to -1,
|
||||
const unsigned ndata = static_cast<unsigned>( position.size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
const int nid = position[i];
|
||||
if( nid >= 0 ){
|
||||
if( tree[ nid ].is_leaf() ){
|
||||
position[i] = -1;
|
||||
}else{
|
||||
// push to default branch, correct latter
|
||||
position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 2, classify the non-default data into right places
|
||||
std::vector<unsigned> fsplits;
|
||||
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
if( !tree[nid].is_leaf() ) fsplits.push_back( tree[nid].split_index() );
|
||||
}
|
||||
std::sort( fsplits.begin(), fsplits.end() );
|
||||
fsplits.resize( std::unique( fsplits.begin(), fsplits.end() ) - fsplits.begin() );
|
||||
|
||||
const unsigned nfeats = static_cast<unsigned>( fsplits.size() );
|
||||
#pragma omp parallel for schedule( dynamic, 1 )
|
||||
for( unsigned i = 0; i < nfeats; ++ i ){
|
||||
const unsigned fid = fsplits[i];
|
||||
for( typename FMatrix::ColIter it = smat.GetSortedCol( fid ); it.Next(); ){
|
||||
const bst_uint ridx = it.rindex();
|
||||
int nid = position[ ridx ];
|
||||
if( nid == -1 ) continue;
|
||||
// go back to parent, correct those who are not default
|
||||
nid = tree[ nid ].parent();
|
||||
if( tree[ nid ].split_index() == fid ){
|
||||
if( it.fvalue() < tree[nid].split_cond() ){
|
||||
position[ ridx ] = tree[ nid ].cleft();
|
||||
}else{
|
||||
position[ ridx ] = tree[ nid ].cright();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
// initialize temp data structure
|
||||
inline void InitData( void ){
|
||||
{
|
||||
position.resize( grad.size() );
|
||||
if( root_index.size() == 0 ){
|
||||
std::fill( position.begin(), position.end(), 0 );
|
||||
}else{
|
||||
for( size_t i = 0; i < root_index.size(); ++ i ){
|
||||
position[i] = root_index[i];
|
||||
utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
|
||||
}
|
||||
}
|
||||
// mark delete for the deleted datas
|
||||
for( size_t i = 0; i < grad.size(); ++ i ){
|
||||
if( hess[i] < 0.0f ) position[i] = -1;
|
||||
}
|
||||
if( param.subsample < 1.0f - 1e-6f ){
|
||||
for( size_t i = 0; i < grad.size(); ++ i ){
|
||||
if( hess[i] < 0.0f ) continue;
|
||||
if( random::SampleBinary( param.subsample) == 0 ){
|
||||
position[ i ] = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{// initialize feature index
|
||||
int ncol = static_cast<int>( smat.NumCol() );
|
||||
for( int i = 0; i < ncol; i ++ ){
|
||||
if( smat.GetSortedCol(i).Next() && constrain.NotBanned(i) ){
|
||||
feat_index.push_back( i );
|
||||
}
|
||||
}
|
||||
random::Shuffle( feat_index );
|
||||
}
|
||||
{// setup temp space for each thread
|
||||
if( param.nthread != 0 ){
|
||||
omp_set_num_threads( param.nthread );
|
||||
}
|
||||
#pragma omp parallel
|
||||
{
|
||||
this->nthread = omp_get_num_threads();
|
||||
}
|
||||
|
||||
// reserve a small space
|
||||
stemp.resize( this->nthread, std::vector<ThreadEntry>() );
|
||||
for( size_t i = 0; i < stemp.size(); ++ i ){
|
||||
stemp[i].reserve( 256 );
|
||||
}
|
||||
snode.reserve( 256 );
|
||||
}
|
||||
|
||||
{// expand query
|
||||
qexpand.reserve( 256 ); qexpand.clear();
|
||||
for( int i = 0; i < tree.param.num_roots; ++ i ){
|
||||
qexpand.push_back( i );
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
// number of omp thread used during training
|
||||
int nthread;
|
||||
// Per feature: shuffle index of each feature index
|
||||
std::vector<int> feat_index;
|
||||
// Instance Data: current node position in the tree of each instance
|
||||
std::vector<int> position;
|
||||
// PerThread x PerTreeNode: statistics for per thread construction
|
||||
std::vector< std::vector<ThreadEntry> > stemp;
|
||||
private:
|
||||
const std::vector<float> &grad;
|
||||
const std::vector<float> &hess;
|
||||
const FMatrix &smat;
|
||||
const std::vector<unsigned> &root_index;
|
||||
const utils::FeatConstrain &constrain;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
386
booster/tree/xgboost_row_treemaker.hpp
Normal file
386
booster/tree/xgboost_row_treemaker.hpp
Normal file
@@ -0,0 +1,386 @@
|
||||
#ifndef XGBOOST_ROW_TREEMAKER_HPP
|
||||
#define XGBOOST_ROW_TREEMAKER_HPP
|
||||
/*!
|
||||
* \file xgboost_row_treemaker.hpp
|
||||
* \brief implementation of regression tree maker,
|
||||
* use a row based approach
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
// use openmp
|
||||
#include <vector>
|
||||
#include "xgboost_tree_model.h"
|
||||
#include "../../utils/xgboost_omp.h"
|
||||
#include "../../utils/xgboost_random.h"
|
||||
#include "../../utils/xgboost_fmap.h"
|
||||
#include "xgboost_base_treemaker.hpp"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
template<typename FMatrix>
|
||||
class RowTreeMaker : protected BaseTreeMaker{
|
||||
public:
|
||||
RowTreeMaker( RegTree &tree,
|
||||
const TreeParamTrain ¶m,
|
||||
const std::vector<float> &grad,
|
||||
const std::vector<float> &hess,
|
||||
const FMatrix &smat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
const utils::FeatConstrain &constrain )
|
||||
: BaseTreeMaker( tree, param ),
|
||||
grad(grad), hess(hess),
|
||||
smat(smat), root_index(root_index), constrain(constrain) {
|
||||
utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
|
||||
utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
|
||||
utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
|
||||
{// setup temp space for each thread
|
||||
if( param.nthread != 0 ){
|
||||
omp_set_num_threads( param.nthread );
|
||||
}
|
||||
#pragma omp parallel
|
||||
{
|
||||
this->nthread = omp_get_num_threads();
|
||||
}
|
||||
tmp_rptr.resize( this->nthread, std::vector<size_t>() );
|
||||
snode.reserve( 256 );
|
||||
}
|
||||
}
|
||||
inline void Make( int& stat_max_depth, int& stat_num_pruned ){
|
||||
this->InitData();
|
||||
this->InitNewNode( this->qexpand );
|
||||
stat_max_depth = 0;
|
||||
|
||||
for( int depth = 0; depth < param.max_depth; ++ depth ){
|
||||
this->FindSplit( this->qexpand, depth );
|
||||
this->UpdateQueueExpand( this->qexpand );
|
||||
this->InitNewNode( this->qexpand );
|
||||
// if nothing left to be expand, break
|
||||
if( qexpand.size() == 0 ) break;
|
||||
stat_max_depth = depth + 1;
|
||||
}
|
||||
// set all the rest expanding nodes to leaf
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
|
||||
}
|
||||
// start prunning the tree
|
||||
stat_num_pruned = this->DoPrune();
|
||||
}
|
||||
// expand a specific node
|
||||
inline bool Expand( const std::vector<bst_uint> &valid_index, int nid ){
|
||||
if( valid_index.size() == 0 ) return false;
|
||||
this->InitDataExpand( valid_index, nid );
|
||||
this->InitNewNode( this->qexpand );
|
||||
this->FindSplit( nid, tmp_rptr[0] );
|
||||
|
||||
// update node statistics
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
|
||||
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
|
||||
}
|
||||
// change the leaf
|
||||
this->UpdateQueueExpand( this->qexpand );
|
||||
this->InitNewNode( this->qexpand );
|
||||
|
||||
// set all the rest expanding nodes to leaf
|
||||
for( size_t i = 0; i < qexpand.size(); ++ i ){
|
||||
const int nid = qexpand[i];
|
||||
|
||||
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
|
||||
tree.stat( nid ).loss_chg = 0.0f;
|
||||
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
|
||||
tree.param.max_depth = std::max( tree.param.max_depth, tree.GetDepth( nid ) );
|
||||
}
|
||||
if( qexpand.size() != 0 ) {
|
||||
return true;
|
||||
}else{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// collapse specific node
|
||||
inline void Collapse( const std::vector<bst_uint> &valid_index, int nid ){
|
||||
if( valid_index.size() == 0 ) return;
|
||||
this->InitDataExpand( valid_index, nid );
|
||||
this->InitNewNode( this->qexpand );
|
||||
tree.stat( nid ).loss_chg = 0.0f;
|
||||
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
|
||||
tree.CollapseToLeaf( nid, snode[nid].weight * param.learning_rate );
|
||||
}
|
||||
private:
|
||||
// make leaf nodes for all qexpand, update node statistics, mark leaf value
|
||||
inline void InitNewNode( const std::vector<int> &qexpand ){
|
||||
snode.resize( tree.param.num_nodes, NodeEntry() );
|
||||
|
||||
for( size_t j = 0; j < qexpand.size(); ++j ){
|
||||
const int nid = qexpand[ j ];
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
|
||||
for( bst_uint i = node_bound[nid].first; i < node_bound[nid].second; ++i ){
|
||||
const bst_uint ridx = row_index_set[i];
|
||||
sum_grad += grad[ridx]; sum_hess += hess[ridx];
|
||||
}
|
||||
// update node statistics
|
||||
snode[nid].sum_grad = sum_grad;
|
||||
snode[nid].sum_hess = sum_hess;
|
||||
|
||||
snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
|
||||
if( !tree[nid].is_root() ){
|
||||
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, tree.stat( tree[nid].parent() ).base_weight );
|
||||
tree.stat(nid).base_weight = snode[nid].weight;
|
||||
}else{
|
||||
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
|
||||
tree.stat(nid).base_weight = snode[nid].weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
// enumerate the split values of specific feature
|
||||
template<typename Iter>
|
||||
inline void EnumerateSplit( Iter it, SplitEntry &best, const int nid, const unsigned fid, bool is_forward_search ){
|
||||
float last_fvalue = 0.0f;
|
||||
double sum_hess = 0.0, sum_grad = 0.0;
|
||||
const NodeEntry enode = snode[ nid ];
|
||||
|
||||
while( it.Next() ){
|
||||
const bst_uint ridx = it.rindex();
|
||||
const float fvalue = it.fvalue();
|
||||
|
||||
if( sum_hess == 0.0 ){
|
||||
sum_grad = grad[ ridx ];
|
||||
sum_hess = hess[ ridx ];
|
||||
last_fvalue = fvalue;
|
||||
}else{
|
||||
// try to find a split
|
||||
if( fabsf(fvalue - last_fvalue) > rt_2eps && sum_hess >= param.min_child_weight ){
|
||||
const double csum_hess = enode.sum_hess - sum_hess;
|
||||
if( csum_hess >= param.min_child_weight ){
|
||||
const double csum_grad = enode.sum_grad - sum_grad;
|
||||
const double loss_chg =
|
||||
+ param.CalcGain( sum_grad, sum_hess, enode.weight )
|
||||
+ param.CalcGain( csum_grad, csum_hess, enode.weight )
|
||||
- enode.root_gain;
|
||||
best.Update( loss_chg, fid, (fvalue + last_fvalue) * 0.5f, !is_forward_search );
|
||||
}else{
|
||||
// the rest part doesn't meet split condition anyway, return
|
||||
return;
|
||||
}
|
||||
}
|
||||
// update the statistics
|
||||
sum_grad += grad[ ridx ];
|
||||
sum_hess += hess[ ridx ];
|
||||
last_fvalue = fvalue;
|
||||
}
|
||||
}
|
||||
|
||||
const double csum_hess = enode.sum_hess - sum_hess;
|
||||
if( sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
|
||||
const double csum_grad = enode.sum_grad - sum_grad;
|
||||
const double loss_chg =
|
||||
+ param.CalcGain( sum_grad, sum_hess, enode.weight )
|
||||
+ param.CalcGain( csum_grad, csum_hess, enode.weight )
|
||||
- snode[nid].root_gain;
|
||||
const float delta = is_forward_search ? rt_eps:-rt_eps;
|
||||
best.Update( loss_chg, fid, last_fvalue + delta, !is_forward_search );
|
||||
}
|
||||
}
|
||||
private:
|
||||
inline void FindSplit( const std::vector<int> &qexpand, int depth ){
|
||||
int nexpand = (int)qexpand.size();
|
||||
if( depth < 3 ){
|
||||
for( int i = 0; i < nexpand; ++ i ){
|
||||
this->FindSplit( qexpand[i], tmp_rptr[0] );
|
||||
}
|
||||
}else{
|
||||
// if get to enough depth, parallelize over node
|
||||
#pragma omp parallel for schedule(dynamic,1)
|
||||
for( int i = 0; i < nexpand; ++ i ){
|
||||
const int tid = omp_get_thread_num();
|
||||
utils::Assert( tid < (int)tmp_rptr.size(), "BUG: FindSplit, tid exceed tmp_rptr size" );
|
||||
this->FindSplit( qexpand[i], tmp_rptr[tid] );
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
inline void MakeSplit( int nid, unsigned gid ){
|
||||
node_bound.resize( tree.param.num_nodes );
|
||||
// re-organize the row_index_set after split on nid
|
||||
const unsigned split_index = tree[nid].split_index();
|
||||
const float split_value = tree[nid].split_cond();
|
||||
|
||||
std::vector<bst_uint> right;
|
||||
bst_uint top = node_bound[nid].first;
|
||||
for( bst_uint i = node_bound[ nid ].first; i < node_bound[ nid ].second; ++i ){
|
||||
const bst_uint ridx = row_index_set[i];
|
||||
bool goleft = tree[ nid ].default_left();
|
||||
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
|
||||
if( it.findex() == split_index ){
|
||||
if( it.fvalue() < split_value ){
|
||||
goleft = true; break;
|
||||
}else{
|
||||
goleft = false; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( goleft ) {
|
||||
row_index_set[ top ++ ] = ridx;
|
||||
}else{
|
||||
right.push_back( ridx );
|
||||
}
|
||||
}
|
||||
node_bound[ tree[nid].cleft() ] = std::make_pair( node_bound[nid].first, top );
|
||||
node_bound[ tree[nid].cright() ] = std::make_pair( top, node_bound[nid].second );
|
||||
|
||||
utils::Assert( node_bound[nid].second - top == (bst_uint)right.size(), "BUG:MakeSplit" );
|
||||
for( size_t i = 0; i < right.size(); ++ i ){
|
||||
row_index_set[ top ++ ] = right[ i ];
|
||||
}
|
||||
}
|
||||
|
||||
// find splits at current level
|
||||
inline void FindSplit( int nid, std::vector<size_t> &tmp_rptr ){
|
||||
if( tmp_rptr.size() == 0 ){
|
||||
tmp_rptr.resize( tree.param.num_feature + 1, 0 );
|
||||
}
|
||||
const bst_uint begin = node_bound[ nid ].first;
|
||||
const bst_uint end = node_bound[ nid ].second;
|
||||
const unsigned ncgroup = smat.NumColGroup();
|
||||
unsigned best_group = 0;
|
||||
|
||||
for( unsigned gid = 0; gid < ncgroup; ++gid ){
|
||||
// records the columns
|
||||
std::vector<FMatrixS::REntry> centry;
|
||||
// records the active features
|
||||
std::vector<size_t> aclist;
|
||||
utils::SparseCSRMBuilder<FMatrixS::REntry,true> builder( tmp_rptr, centry, aclist );
|
||||
builder.InitBudget( tree.param.num_feature );
|
||||
for( bst_uint i = begin; i < end; ++i ){
|
||||
const bst_uint ridx = row_index_set[i];
|
||||
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
|
||||
const bst_uint findex = it.findex();
|
||||
if( constrain.NotBanned( findex ) ) builder.AddBudget( findex );
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
for( bst_uint i = begin; i < end; ++i ){
|
||||
const bst_uint ridx = row_index_set[i];
|
||||
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
|
||||
const bst_uint findex = it.findex();
|
||||
if( constrain.NotBanned( findex ) ) {
|
||||
builder.PushElem( findex, FMatrixS::REntry( ridx, it.fvalue() ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
// --- end of building column major matrix ---
|
||||
// after this point, tmp_rptr and entry is ready to use
|
||||
int naclist = (int)aclist.size();
|
||||
// best entry for each thread
|
||||
SplitEntry nbest, tbest;
|
||||
#pragma omp parallel private(tbest)
|
||||
{
|
||||
#pragma omp for schedule(dynamic,1)
|
||||
for( int j = 0; j < naclist; ++j ){
|
||||
bst_uint findex = static_cast<bst_uint>( aclist[j] );
|
||||
// local sort can be faster when the features are sparse
|
||||
std::sort( centry.begin() + tmp_rptr[findex], centry.begin() + tmp_rptr[findex+1], FMatrixS::REntry::cmp_fvalue );
|
||||
if( param.need_forward_search() ){
|
||||
this->EnumerateSplit( FMatrixS::ColIter( ¢ry[tmp_rptr[findex]]-1, ¢ry[tmp_rptr[findex+1]] - 1 ),
|
||||
tbest, nid, findex, true );
|
||||
}
|
||||
if( param.need_backward_search() ){
|
||||
this->EnumerateSplit( FMatrixS::ColBackIter( ¢ry[tmp_rptr[findex+1]], ¢ry[tmp_rptr[findex]] ),
|
||||
tbest, nid, findex, false );
|
||||
}
|
||||
}
|
||||
#pragma omp critical
|
||||
{
|
||||
nbest.Update( tbest );
|
||||
}
|
||||
}
|
||||
// if current solution gives the best
|
||||
if( snode[nid].best.Update( nbest ) ){
|
||||
best_group = gid;
|
||||
}
|
||||
// cleanup tmp_rptr for next usage
|
||||
builder.Cleanup();
|
||||
}
|
||||
|
||||
// at this point, we already know the best split
|
||||
if( snode[nid].best.loss_chg > rt_eps ){
|
||||
const SplitEntry &e = snode[nid].best;
|
||||
tree.AddChilds( nid );
|
||||
tree[ nid ].set_split( e.split_index(), e.split_value, e.default_left() );
|
||||
this->MakeSplit( nid, best_group );
|
||||
}else{
|
||||
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
|
||||
}
|
||||
}
|
||||
private:
|
||||
// initialize temp data structure
|
||||
inline void InitData( void ){
|
||||
std::vector<bst_uint> valid_index;
|
||||
for( size_t i = 0; i < grad.size(); ++i ){
|
||||
if( hess[ i ] < 0.0f ) continue;
|
||||
if( param.subsample > 1.0f-1e-6f || random::SampleBinary( param.subsample ) != 0 ){
|
||||
valid_index.push_back( static_cast<bst_uint>(i) );
|
||||
}
|
||||
}
|
||||
node_bound.resize( tree.param.num_roots );
|
||||
|
||||
if( root_index.size() == 0 ){
|
||||
row_index_set = valid_index;
|
||||
// set bound of root node
|
||||
node_bound[0] = std::make_pair( 0, (bst_uint)row_index_set.size() );
|
||||
}else{
|
||||
std::vector<size_t> rptr;
|
||||
utils::SparseCSRMBuilder<bst_uint> builder( rptr, row_index_set );
|
||||
builder.InitBudget( tree.param.num_roots );
|
||||
for( size_t i = 0; i < valid_index.size(); ++i ){
|
||||
const bst_uint rid = valid_index[ i ];
|
||||
utils::Assert( root_index[ rid ] < (unsigned)tree.param.num_roots, "root id exceed number of roots" );
|
||||
builder.AddBudget( root_index[ rid ] );
|
||||
}
|
||||
builder.InitStorage();
|
||||
for( size_t i = 0; i < valid_index.size(); ++i ){
|
||||
const bst_uint rid = valid_index[ i ];
|
||||
builder.PushElem( root_index[ rid ], rid );
|
||||
}
|
||||
for( size_t i = 1; i < rptr.size(); ++ i ){
|
||||
node_bound[i-1] = std::make_pair( rptr[ i - 1 ], rptr[ i ] );
|
||||
}
|
||||
}
|
||||
|
||||
{// expand query
|
||||
qexpand.reserve( 256 ); qexpand.clear();
|
||||
for( int i = 0; i < tree.param.num_roots; ++ i ){
|
||||
qexpand.push_back( i );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// initialize temp data structure
|
||||
inline void InitDataExpand( const std::vector<bst_uint> &valid_index, int nid ){
|
||||
row_index_set = valid_index;
|
||||
node_bound.resize( tree.param.num_nodes );
|
||||
node_bound[ nid ] = std::make_pair( 0, (bst_uint)row_index_set.size() );
|
||||
|
||||
qexpand.clear(); qexpand.push_back( nid );
|
||||
}
|
||||
private:
|
||||
// number of omp thread used during training
|
||||
int nthread;
|
||||
// tmp row pointer, per thread, used for tmp data construction
|
||||
std::vector< std::vector<size_t> > tmp_rptr;
|
||||
// Instance row indexes corresponding to each node
|
||||
std::vector<bst_uint> row_index_set;
|
||||
// lower and upper bound of each nodes' row_index
|
||||
std::vector< std::pair<bst_uint, bst_uint> > node_bound;
|
||||
private:
|
||||
const std::vector<float> &grad;
|
||||
const std::vector<float> &hess;
|
||||
const FMatrix &smat;
|
||||
const std::vector<unsigned> &root_index;
|
||||
const utils::FeatConstrain &constrain;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
430
booster/tree/xgboost_svdf_tree.hpp
Normal file
430
booster/tree/xgboost_svdf_tree.hpp
Normal file
@@ -0,0 +1,430 @@
|
||||
#ifndef XGBOOST_APEX_TREE_HPP
|
||||
#define XGBOOST_APEX_TREE_HPP
|
||||
/*!
|
||||
* \file xgboost_svdf_tree.hpp
|
||||
* \brief implementation of regression tree constructor, with layerwise support
|
||||
* this file is adapted from GBRT implementation in SVDFeature project
|
||||
* \author Tianqi Chen: tqchen@apex.sjtu.edu.cn, tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include "xgboost_tree_model.h"
|
||||
#include "../../utils/xgboost_random.h"
|
||||
#include "../../utils/xgboost_matrix_csr.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
inline void assert_sorted( unsigned *idset, int len ){
|
||||
if( !rt_debug || !check_bug ) return;
|
||||
for( int i = 1; i < len; i ++ ){
|
||||
utils::Assert( idset[i-1] < idset[i], "idset not sorted" );
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
namespace booster{
|
||||
// selecter of rtree to find the suitable candidate
|
||||
class RTSelecter{
|
||||
public:
|
||||
struct Entry{
|
||||
float loss_chg;
|
||||
size_t start;
|
||||
int len;
|
||||
unsigned sindex;
|
||||
float split_value;
|
||||
Entry(){}
|
||||
Entry( float loss_chg, size_t start, int len, unsigned split_index, float split_value, bool default_left ){
|
||||
this->loss_chg = loss_chg;
|
||||
this->start = start;
|
||||
this->len = len;
|
||||
if( default_left ) split_index |= (1U << 31);
|
||||
this->sindex = split_index;
|
||||
this->split_value = split_value;
|
||||
}
|
||||
inline unsigned split_index( void ) const{
|
||||
return sindex & ( (1U<<31) - 1U );
|
||||
}
|
||||
inline bool default_left( void ) const{
|
||||
return (sindex >> 31) != 0;
|
||||
}
|
||||
};
|
||||
private:
|
||||
Entry best_entry;
|
||||
const TreeParamTrain ¶m;
|
||||
public:
|
||||
RTSelecter( const TreeParamTrain &p ):param( p ){
|
||||
memset( &best_entry, 0, sizeof(best_entry) );
|
||||
best_entry.loss_chg = 0.0f;
|
||||
}
|
||||
inline void push_back( const Entry &e ){
|
||||
if( e.loss_chg > best_entry.loss_chg ) best_entry = e;
|
||||
}
|
||||
inline const Entry & select( void ){
|
||||
return best_entry;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// updater of rtree, allows the parameters to be stored inside, key solver
|
||||
template<typename FMatrix>
|
||||
class RTreeUpdater{
|
||||
protected:
|
||||
// training task, element of single task
|
||||
struct Task{
|
||||
// node id in tree
|
||||
int nid;
|
||||
// idset pointer, instance id in [idset,idset+len)
|
||||
unsigned *idset;
|
||||
// length of idset
|
||||
unsigned len;
|
||||
// base_weight of parent
|
||||
float parent_base_weight;
|
||||
Task(){}
|
||||
Task( int nid, unsigned *idset, unsigned len, float pweight = 0.0f ){
|
||||
this->nid = nid;
|
||||
this->idset = idset;
|
||||
this->len = len;
|
||||
this->parent_base_weight = pweight;
|
||||
}
|
||||
};
|
||||
|
||||
// sparse column entry
|
||||
struct SCEntry{
|
||||
// feature value
|
||||
float fvalue;
|
||||
// row index in grad
|
||||
unsigned rindex;
|
||||
SCEntry(){}
|
||||
SCEntry( float fvalue, unsigned rindex ){
|
||||
this->fvalue = fvalue; this->rindex = rindex;
|
||||
}
|
||||
inline bool operator<( const SCEntry &p ) const{
|
||||
return fvalue < p.fvalue;
|
||||
}
|
||||
};
|
||||
private:
|
||||
// training parameter
|
||||
const TreeParamTrain ¶m;
|
||||
// parameters, reference
|
||||
RegTree &tree;
|
||||
std::vector<float> &grad;
|
||||
std::vector<float> &hess;
|
||||
const FMatrix &smat;
|
||||
const std::vector<unsigned> &group_id;
|
||||
private:
|
||||
// maximum depth up to now
|
||||
int max_depth;
|
||||
// number of nodes being pruned
|
||||
int num_pruned;
|
||||
// stack to store current task
|
||||
std::vector<Task> task_stack;
|
||||
// temporal space for index set
|
||||
std::vector<unsigned> idset;
|
||||
private:
|
||||
// task management: NOTE DFS here
|
||||
inline void add_task( Task tsk ){
|
||||
task_stack.push_back( tsk );
|
||||
}
|
||||
inline bool next_task( Task &tsk ){
|
||||
if( task_stack.size() == 0 ) return false;
|
||||
tsk = task_stack.back();
|
||||
task_stack.pop_back();
|
||||
return true;
|
||||
}
|
||||
private:
|
||||
// try to prune off current leaf, return true if successful
|
||||
inline void try_prune_leaf( int nid, int depth ){
|
||||
if( tree[ nid ].is_root() ) return;
|
||||
int pid = tree[ nid ].parent();
|
||||
RegTree::NodeStat &s = tree.stat( pid );
|
||||
s.leaf_child_cnt ++;
|
||||
|
||||
if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
|
||||
// need to be pruned
|
||||
tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight );
|
||||
// add statistics to number of nodes pruned
|
||||
num_pruned += 2;
|
||||
// tail recursion
|
||||
this->try_prune_leaf( pid, depth - 1 );
|
||||
}
|
||||
}
|
||||
// make leaf for current node :)
|
||||
inline void make_leaf( Task tsk, double sum_grad, double sum_hess, bool compute ){
|
||||
for( unsigned i = 0; i < tsk.len; i ++ ){
|
||||
const unsigned ridx = tsk.idset[i];
|
||||
if( compute ){
|
||||
sum_grad += grad[ ridx ];
|
||||
sum_hess += hess[ ridx ];
|
||||
}
|
||||
}
|
||||
tree.stat( tsk.nid ).sum_hess = static_cast<float>( sum_hess );
|
||||
tree[ tsk.nid ].set_leaf( param.learning_rate * param.CalcWeight( sum_grad, sum_hess, tsk.parent_base_weight ) );
|
||||
this->try_prune_leaf( tsk.nid, tree.GetDepth( tsk.nid ) );
|
||||
}
|
||||
private:
|
||||
// make split for current task, re-arrange positions in idset
|
||||
inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double sum_hess, double base_weight ){
|
||||
// before split, first prepare statistics
|
||||
RegTree::NodeStat &s = tree.stat( tsk.nid );
|
||||
s.loss_chg = loss_chg;
|
||||
s.leaf_child_cnt = 0;
|
||||
s.sum_hess = static_cast<float>( sum_hess );
|
||||
s.base_weight = static_cast<float>( base_weight );
|
||||
|
||||
// add childs to current node
|
||||
tree.AddChilds( tsk.nid );
|
||||
// assert that idset is sorted
|
||||
assert_sorted( tsk.idset, tsk.len );
|
||||
// use merge sort style to get the solution
|
||||
std::vector<unsigned> qset;
|
||||
for( int i = 0; i < num; i ++ ){
|
||||
qset.push_back( entry[i].rindex );
|
||||
}
|
||||
std::sort( qset.begin(), qset.end() );
|
||||
// do merge sort style, make the other set, remove elements in qset
|
||||
for( unsigned i = 0, top = 0; i < tsk.len; i ++ ){
|
||||
if( top < qset.size() ){
|
||||
if( tsk.idset[ i ] != qset[ top ] ){
|
||||
tsk.idset[ i - top ] = tsk.idset[ i ];
|
||||
}else{
|
||||
top ++;
|
||||
}
|
||||
}else{
|
||||
tsk.idset[ i - qset.size() ] = tsk.idset[ i ];
|
||||
}
|
||||
}
|
||||
// get two parts
|
||||
RegTree::Node &n = tree[ tsk.nid ];
|
||||
Task def_part( n.default_left() ? n.cleft() : n.cright(), tsk.idset, tsk.len - qset.size(), s.base_weight );
|
||||
Task spl_part( n.default_left() ? n.cright(): n.cleft() , tsk.idset + def_part.len, qset.size(), s.base_weight );
|
||||
// fill back split part
|
||||
for( unsigned i = 0; i < spl_part.len; i ++ ){
|
||||
spl_part.idset[ i ] = qset[ i ];
|
||||
}
|
||||
// add tasks to the queue
|
||||
this->add_task( def_part );
|
||||
this->add_task( spl_part );
|
||||
}
|
||||
|
||||
// enumerate split point of the tree
|
||||
inline void enumerate_split( RTSelecter &sglobal, int tlen,
|
||||
double rsum_grad, double rsum_hess, double root_gain,
|
||||
const SCEntry *entry, size_t start, size_t end,
|
||||
int findex, float parent_base_weight ){
|
||||
// local selecter
|
||||
RTSelecter slocal( param );
|
||||
|
||||
if( param.need_forward_search() ){
|
||||
// forward process, default right
|
||||
double csum_grad = 0.0, csum_hess = 0.0;
|
||||
for( size_t j = start; j < end; j ++ ){
|
||||
const unsigned ridx = entry[ j ].rindex;
|
||||
csum_grad += grad[ ridx ];
|
||||
csum_hess += hess[ ridx ];
|
||||
// check for split
|
||||
if( j == end - 1 || entry[j].fvalue + rt_2eps < entry[ j + 1 ].fvalue ){
|
||||
if( csum_hess < param.min_child_weight ) continue;
|
||||
const double dsum_hess = rsum_hess - csum_hess;
|
||||
if( dsum_hess < param.min_child_weight ) break;
|
||||
// change of loss
|
||||
double loss_chg =
|
||||
param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
|
||||
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
|
||||
|
||||
const int clen = static_cast<int>( j + 1 - start );
|
||||
// add candidate to selecter
|
||||
slocal.push_back( RTSelecter::Entry( loss_chg, start, clen, findex,
|
||||
j == end - 1 ? entry[j].fvalue + rt_eps : 0.5 * (entry[j].fvalue+entry[j+1].fvalue),
|
||||
false ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( param.need_backward_search() ){
|
||||
// backward process, default left
|
||||
double csum_grad = 0.0, csum_hess = 0.0;
|
||||
for( size_t j = end; j > start; j -- ){
|
||||
const unsigned ridx = entry[ j - 1 ].rindex;
|
||||
csum_grad += grad[ ridx ];
|
||||
csum_hess += hess[ ridx ];
|
||||
// check for split
|
||||
if( j == start + 1 || entry[ j - 2 ].fvalue + rt_2eps < entry[ j - 1 ].fvalue ){
|
||||
if( csum_hess < param.min_child_weight ) continue;
|
||||
const double dsum_hess = rsum_hess - csum_hess;
|
||||
if( dsum_hess < param.min_child_weight ) break;
|
||||
double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
|
||||
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
|
||||
const int clen = static_cast<int>( end - j + 1 );
|
||||
// add candidate to selecter
|
||||
slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
|
||||
j == start + 1 ? entry[j-1].fvalue - rt_eps : 0.5 * (entry[j-2].fvalue + entry[j-1].fvalue),
|
||||
true ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
sglobal.push_back( slocal.select() );
|
||||
}
|
||||
|
||||
private:
|
||||
// temporal storage for expand column major
|
||||
std::vector<size_t> tmp_rptr;
|
||||
// find split for current task, another implementation of expand in column major manner
|
||||
// should be more memory frugal, avoid global sorting across feature
|
||||
inline void expand( Task tsk ){
|
||||
// assert that idset is sorted
|
||||
// if reach maximum depth, make leaf from current node
|
||||
int depth = tree.GetDepth( tsk.nid );
|
||||
// update statistiss
|
||||
if( depth > max_depth ) max_depth = depth;
|
||||
// if bigger than max depth
|
||||
if( depth >= param.max_depth ){
|
||||
this->make_leaf( tsk, 0.0, 0.0, true ); return;
|
||||
}
|
||||
// convert to column major CSR format
|
||||
const int nrows = tree.param.num_feature;
|
||||
if( tmp_rptr.size() == 0 ){
|
||||
// initialize tmp storage in first usage
|
||||
tmp_rptr.resize( nrows + 1 );
|
||||
std::fill( tmp_rptr.begin(), tmp_rptr.end(), 0 );
|
||||
}
|
||||
// records the columns
|
||||
std::vector<SCEntry> entry;
|
||||
// records the active features
|
||||
std::vector<size_t> aclist;
|
||||
utils::SparseCSRMBuilder<SCEntry,true> builder( tmp_rptr, entry, aclist );
|
||||
builder.InitBudget( nrows );
|
||||
// statistics of root
|
||||
double rsum_grad = 0.0, rsum_hess = 0.0;
|
||||
for( unsigned i = 0; i < tsk.len; i ++ ){
|
||||
const unsigned ridx = tsk.idset[i];
|
||||
rsum_grad += grad[ ridx ];
|
||||
rsum_hess += hess[ ridx ];
|
||||
|
||||
for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
|
||||
builder.AddBudget( it.findex() );
|
||||
}
|
||||
}
|
||||
|
||||
// if minimum split weight is not meet
|
||||
if( param.cannot_split( rsum_hess, depth ) ){
|
||||
this->make_leaf( tsk, rsum_grad, rsum_hess, false ); builder.Cleanup(); return;
|
||||
}
|
||||
|
||||
builder.InitStorage();
|
||||
for( unsigned i = 0; i < tsk.len; i ++ ){
|
||||
const unsigned ridx = tsk.idset[i];
|
||||
for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
|
||||
builder.PushElem( it.findex(), SCEntry( it.fvalue(), ridx ) );
|
||||
}
|
||||
}
|
||||
// --- end of building column major matrix ---
|
||||
// after this point, tmp_rptr and entry is ready to use
|
||||
|
||||
// global selecter
|
||||
RTSelecter sglobal( param );
|
||||
// gain root
|
||||
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
|
||||
// KEY: layerwise, weight of current node if it is leaf
|
||||
const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
|
||||
// enumerate feature index
|
||||
for( size_t i = 0; i < aclist.size(); i ++ ){
|
||||
int findex = static_cast<int>( aclist[i] );
|
||||
size_t start = tmp_rptr[ findex ];
|
||||
size_t end = tmp_rptr[ findex + 1 ];
|
||||
utils::Assert( start < end, "bug" );
|
||||
// local sort can be faster when the features are sparse
|
||||
std::sort( entry.begin() + start, entry.begin() + end );
|
||||
// local selecter
|
||||
this->enumerate_split( sglobal, tsk.len,
|
||||
rsum_grad, rsum_hess, root_gain,
|
||||
&entry[0], start, end, findex, base_weight );
|
||||
}
|
||||
// Cleanup tmp_rptr for next use
|
||||
builder.Cleanup();
|
||||
// get the best solution
|
||||
const RTSelecter::Entry &e = sglobal.select();
|
||||
// allowed to split
|
||||
if( e.loss_chg > rt_eps ){
|
||||
// add splits
|
||||
tree[ tsk.nid ].set_split( e.split_index(), e.split_value, e.default_left() );
|
||||
// re-arrange idset, push tasks
|
||||
this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, rsum_hess, base_weight );
|
||||
}else{
|
||||
// make leaf if we didn't meet requirement
|
||||
this->make_leaf( tsk, rsum_grad, rsum_hess, false );
|
||||
}
|
||||
}
|
||||
private:
|
||||
// initialize the tasks
|
||||
inline void init_tasks( size_t ngrads ){
|
||||
// add group partition if necessary
|
||||
if( group_id.size() == 0 ){
|
||||
if( param.subsample > 1.0f - 1e-6f ){
|
||||
idset.resize( 0 );
|
||||
for( size_t i = 0; i < ngrads; i ++ ){
|
||||
if( hess[i] < 0.0f ) continue;
|
||||
idset.push_back( (unsigned)i );
|
||||
}
|
||||
}else{
|
||||
idset.resize( 0 );
|
||||
for( size_t i = 0; i < ngrads; i ++ ){
|
||||
if( random::SampleBinary( param.subsample ) != 0 ){
|
||||
idset.push_back( (unsigned)i );
|
||||
}
|
||||
}
|
||||
}
|
||||
this->add_task( Task( 0, &idset[0], idset.size() ) ); return;
|
||||
}
|
||||
|
||||
utils::Assert( group_id.size() == ngrads, "number of groups must be exact" );
|
||||
{// new method for grouping, use CSR builder
|
||||
std::vector<size_t> rptr;
|
||||
utils::SparseCSRMBuilder<unsigned> builder( rptr, idset );
|
||||
builder.InitBudget( tree.param.num_roots );
|
||||
for( size_t i = 0; i < group_id.size(); i ++ ){
|
||||
// drop invalid elements
|
||||
if( hess[ i ] < 0.0f ) continue;
|
||||
utils::Assert( group_id[ i ] < (unsigned)tree.param.num_roots,
|
||||
"group id exceed number of roots" );
|
||||
builder.AddBudget( group_id[ i ] );
|
||||
}
|
||||
builder.InitStorage();
|
||||
for( size_t i = 0; i < group_id.size(); i ++ ){
|
||||
// drop invalid elements
|
||||
if( hess[ i ] < 0.0f ) continue;
|
||||
builder.PushElem( group_id[ i ], static_cast<unsigned>(i) );
|
||||
}
|
||||
for( size_t i = 1; i < rptr.size(); i ++ ){
|
||||
const size_t start = rptr[ i - 1 ], end = rptr[ i ];
|
||||
if( start < end ){
|
||||
this->add_task( Task( i - 1, &idset[ start ], end - start ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
public:
|
||||
RTreeUpdater( const TreeParamTrain &pparam,
|
||||
RegTree &ptree,
|
||||
std::vector<float> &pgrad,
|
||||
std::vector<float> &phess,
|
||||
const FMatrix &psmat,
|
||||
const std::vector<unsigned> &pgroup_id ):
|
||||
param( pparam ), tree( ptree ), grad( pgrad ), hess( phess ),
|
||||
smat( psmat ), group_id( pgroup_id ){
|
||||
}
|
||||
inline int do_boost( int &num_pruned ){
|
||||
this->init_tasks( grad.size() );
|
||||
this->max_depth = 0;
|
||||
this->num_pruned = 0;
|
||||
Task tsk;
|
||||
while( this->next_task( tsk ) ){
|
||||
this->expand( tsk );
|
||||
}
|
||||
num_pruned = this->num_pruned;
|
||||
return max_depth;
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
268
booster/tree/xgboost_tree.hpp
Normal file
268
booster/tree/xgboost_tree.hpp
Normal file
@@ -0,0 +1,268 @@
|
||||
#ifndef XGBOOST_TREE_HPP
|
||||
#define XGBOOST_TREE_HPP
|
||||
/*!
|
||||
* \file xgboost_tree.hpp
|
||||
* \brief implementation of regression tree
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include "xgboost_tree_model.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
const bool rt_debug = false;
|
||||
// whether to check bugs
|
||||
const bool check_bug = false;
|
||||
|
||||
const float rt_eps = 1e-5f;
|
||||
const float rt_2eps = rt_eps * 2.0f;
|
||||
|
||||
inline double sqr( double a ){
|
||||
return a * a;
|
||||
}
|
||||
};
|
||||
};
|
||||
#include "../../utils/xgboost_fmap.h"
|
||||
#include "xgboost_svdf_tree.hpp"
|
||||
#include "xgboost_col_treemaker.hpp"
|
||||
#include "xgboost_row_treemaker.hpp"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
// regression tree, construction algorithm is seperated from this class
|
||||
// see RegTreeUpdater
|
||||
template<typename FMatrix>
|
||||
class RegTreeTrainer : public InterfaceBooster<FMatrix>{
|
||||
public:
|
||||
RegTreeTrainer( void ){
|
||||
silent = 0; tree_maker = 1;
|
||||
// interact mode
|
||||
interact_type = 0;
|
||||
interact_node = 0;
|
||||
// normally we won't have more than 64 OpenMP threads
|
||||
threadtemp.resize( 64, ThreadEntry() );
|
||||
}
|
||||
virtual ~RegTreeTrainer( void ){}
|
||||
public:
|
||||
virtual void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp( name, "silent") ) silent = atoi( val );
|
||||
if( !strcmp( name, "tree_maker") ) tree_maker = atoi( val );
|
||||
if( !strncmp( name, "interact:", 9) ){
|
||||
const char *ename = name + 9;
|
||||
interact_node = atoi( val );
|
||||
if( !strcmp( ename, "expand") ) {
|
||||
interact_type = 1;
|
||||
}
|
||||
if( !strcmp( ename, "remove") ) {
|
||||
interact_type = 2;
|
||||
}
|
||||
}
|
||||
param.SetParam( name, val );
|
||||
constrain.SetParam( name, val );
|
||||
tree.param.SetParam( name, val );
|
||||
}
|
||||
virtual void LoadModel( utils::IStream &fi ){
|
||||
tree.LoadModel( fi );
|
||||
}
|
||||
virtual void SaveModel( utils::IStream &fo ) const{
|
||||
tree.SaveModel( fo );
|
||||
}
|
||||
virtual void InitModel( void ){
|
||||
tree.InitModel();
|
||||
}
|
||||
public:
|
||||
virtual void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &smat,
|
||||
const std::vector<unsigned> &root_index ){
|
||||
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
|
||||
|
||||
// interactive update
|
||||
if( interact_type != 0 ){
|
||||
switch( interact_type ){
|
||||
case 1: this->ExpandNode( grad, hess, smat, root_index, interact_node ); return;
|
||||
case 2: this->CollapseNode( grad, hess, smat, root_index, interact_node ); return;
|
||||
default: utils::Error("unknown interact type");
|
||||
}
|
||||
}
|
||||
|
||||
if( !silent ){
|
||||
printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
|
||||
}
|
||||
int num_pruned;
|
||||
switch( tree_maker ){
|
||||
case 0: {
|
||||
utils::Assert( !constrain.HasConstrain(), "tree maker 0 does not support constrain" );
|
||||
RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
|
||||
tree.param.max_depth = updater.do_boost( num_pruned );
|
||||
break;
|
||||
}
|
||||
case 1:{
|
||||
ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index, constrain );
|
||||
maker.Make( tree.param.max_depth, num_pruned );
|
||||
break;
|
||||
}
|
||||
case 2:{
|
||||
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index, constrain );
|
||||
maker.Make( tree.param.max_depth, num_pruned );
|
||||
break;
|
||||
}
|
||||
default: utils::Error("unknown tree maker");
|
||||
}
|
||||
if( !silent ){
|
||||
printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
|
||||
tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.MaxDepth() );
|
||||
}
|
||||
}
|
||||
virtual float Predict( const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
|
||||
ThreadEntry &e = this->InitTmp();
|
||||
this->PrepareTmp( fmat.GetRow(ridx), e );
|
||||
int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
|
||||
this->DropTmp( fmat.GetRow(ridx), e );
|
||||
return tree[ pid ].leaf_value();
|
||||
}
|
||||
virtual int GetLeafIndex( const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned gid = 0 ){
|
||||
// start from groups that belongs to current data
|
||||
int pid = (int)gid;
|
||||
// tranverse tree
|
||||
while( !tree[ pid ].is_leaf() ){
|
||||
unsigned split_index = tree[ pid ].split_index();
|
||||
pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
|
||||
virtual void PredPath( std::vector<int> &path, const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
|
||||
path.clear();
|
||||
ThreadEntry &e = this->InitTmp();
|
||||
this->PrepareTmp( fmat.GetRow(ridx), e );
|
||||
|
||||
int pid = (int)gid;
|
||||
path.push_back( pid );
|
||||
// tranverse tree
|
||||
while( !tree[ pid ].is_leaf() ){
|
||||
unsigned split_index = tree[ pid ].split_index();
|
||||
pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
|
||||
path.push_back( pid );
|
||||
}
|
||||
this->DropTmp( fmat.GetRow(ridx), e );
|
||||
}
|
||||
virtual float Predict( const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned gid = 0 ){
|
||||
utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
|
||||
"input data smaller than num feature" );
|
||||
int pid = this->GetLeafIndex( feat, funknown, gid );
|
||||
return tree[ pid ].leaf_value();
|
||||
}
|
||||
virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
|
||||
tree.DumpModel( fo, fmap, with_stats );
|
||||
}
|
||||
private:
|
||||
inline void CollapseNode( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
int nid ){
|
||||
std::vector<bst_uint> valid_index;
|
||||
for( size_t i = 0; i < grad.size(); i ++ ){
|
||||
ThreadEntry &e = this->InitTmp();
|
||||
this->PrepareTmp( fmat.GetRow(i), e );
|
||||
int pid = root_index.size() == 0 ? 0 : (int)root_index[i];
|
||||
// tranverse tree
|
||||
while( !tree[ pid ].is_leaf() ){
|
||||
unsigned split_index = tree[ pid ].split_index();
|
||||
pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
|
||||
if( pid == nid ){
|
||||
valid_index.push_back( static_cast<bst_uint>(i) ); break;
|
||||
}
|
||||
}
|
||||
this->DropTmp( fmat.GetRow(i), e );
|
||||
}
|
||||
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, fmat, root_index, constrain );
|
||||
maker.Collapse( valid_index, nid );
|
||||
if( !silent ){
|
||||
printf( "tree collapse end, max_depth=%d\n", tree.param.max_depth );
|
||||
}
|
||||
}
|
||||
inline void ExpandNode( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
int nid ){
|
||||
std::vector<bst_uint> valid_index;
|
||||
for( size_t i = 0; i < grad.size(); i ++ ){
|
||||
ThreadEntry &e = this->InitTmp();
|
||||
this->PrepareTmp( fmat.GetRow(i), e );
|
||||
unsigned rtidx = root_index.size() == 0 ? 0 : root_index[i];
|
||||
int pid = this->GetLeafIndex( e.feat, e.funknown, rtidx );
|
||||
this->DropTmp( fmat.GetRow(i), e );
|
||||
if( pid == nid ) valid_index.push_back( static_cast<bst_uint>(i) );
|
||||
}
|
||||
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, fmat, root_index, constrain );
|
||||
bool success = maker.Expand( valid_index, nid );
|
||||
if( !silent ){
|
||||
printf( "tree expand end, success=%d, max_depth=%d\n", (int)success, tree.MaxDepth() );
|
||||
}
|
||||
}
|
||||
private:
|
||||
// silent
|
||||
int silent;
|
||||
RegTree tree;
|
||||
TreeParamTrain param;
|
||||
private:
|
||||
// some training parameters
|
||||
// tree maker
|
||||
int tree_maker;
|
||||
// interaction
|
||||
int interact_type;
|
||||
int interact_node;
|
||||
// feature constrain
|
||||
utils::FeatConstrain constrain;
|
||||
private:
|
||||
struct ThreadEntry{
|
||||
std::vector<float> feat;
|
||||
std::vector<bool> funknown;
|
||||
};
|
||||
std::vector<ThreadEntry> threadtemp;
|
||||
private:
|
||||
inline ThreadEntry& InitTmp( void ){
|
||||
const int tid = omp_get_thread_num();
|
||||
utils::Assert( tid < (int)threadtemp.size(), "RTreeUpdater: threadtemp pool is too small" );
|
||||
ThreadEntry &e = threadtemp[ tid ];
|
||||
if( e.feat.size() != (size_t)tree.param.num_feature ){
|
||||
e.feat.resize( tree.param.num_feature );
|
||||
e.funknown.resize( tree.param.num_feature );
|
||||
std::fill( e.funknown.begin(), e.funknown.end(), true );
|
||||
}
|
||||
return e;
|
||||
}
|
||||
inline void PrepareTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
|
||||
while( it.Next() ){
|
||||
const bst_uint findex = it.findex();
|
||||
utils::Assert( findex < (unsigned)tree.param.num_feature , "input feature execeed bound" );
|
||||
e.funknown[ findex ] = false;
|
||||
e.feat[ findex ] = it.fvalue();
|
||||
}
|
||||
}
|
||||
inline void DropTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
|
||||
while( it.Next() ){
|
||||
e.funknown[ it.findex() ] = true;
|
||||
}
|
||||
}
|
||||
|
||||
inline int GetNext( int pid, float fvalue, bool is_unknown ){
|
||||
float split_value = tree[ pid ].split_cond();
|
||||
if( is_unknown ){
|
||||
return tree[ pid ].cdefault();
|
||||
}else{
|
||||
if( fvalue < split_value ) return tree[ pid ].cleft();
|
||||
else return tree[ pid ].cright();
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
554
booster/tree/xgboost_tree_model.h
Normal file
554
booster/tree/xgboost_tree_model.h
Normal file
@@ -0,0 +1,554 @@
|
||||
#ifndef XGBOOST_TREE_MODEL_H
|
||||
#define XGBOOST_TREE_MODEL_H
|
||||
/*!
|
||||
* \file xgboost_tree_model.h
|
||||
* \brief generic definition of model structure used in tree models
|
||||
* used to support learning of boosting tree
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <cstring>
|
||||
#include "../../utils/xgboost_utils.h"
|
||||
#include "../../utils/xgboost_stream.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief template class of TreeModel
|
||||
* \tparam TSplitCond data type to indicate split condition
|
||||
* \tparam TNodeStat auxiliary statistics of node to help tree building
|
||||
*/
|
||||
template<typename TSplitCond,typename TNodeStat>
|
||||
class TreeModel{
|
||||
public:
|
||||
/*! \brief data type to indicate split condition */
|
||||
typedef TNodeStat NodeStat;
|
||||
/*! \brief auxiliary statistics of node to help tree building */
|
||||
typedef TSplitCond SplitCond;
|
||||
public:
|
||||
/*! \brief parameters of the tree */
|
||||
struct Param{
|
||||
/*! \brief number of start root */
|
||||
int num_roots;
|
||||
/*! \brief total number of nodes */
|
||||
int num_nodes;
|
||||
/*!\brief number of deleted nodes */
|
||||
int num_deleted;
|
||||
/*! \brief maximum depth, this is a statistics of the tree */
|
||||
int max_depth;
|
||||
/*! \brief number of features used for tree construction */
|
||||
int num_feature;
|
||||
/*! \brief reserved part */
|
||||
int reserved[ 32 ];
|
||||
/*! \brief constructor */
|
||||
Param( void ){
|
||||
max_depth = 0;
|
||||
memset( reserved, 0, sizeof( reserved ) );
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("num_roots", name ) ) num_roots = atoi( val );
|
||||
if( !strcmp("num_feature", name ) ) num_feature = atoi( val );
|
||||
}
|
||||
};
|
||||
/*! \brief tree node */
|
||||
class Node{
|
||||
private:
|
||||
friend class TreeModel<TSplitCond,TNodeStat>;
|
||||
/*!
|
||||
* \brief in leaf node, we have weights, in non-leaf nodes,
|
||||
* we have split condition
|
||||
*/
|
||||
union Info{
|
||||
float leaf_value;
|
||||
TSplitCond split_cond;
|
||||
};
|
||||
private:
|
||||
// pointer to parent, highest bit is used to indicate whether it's a left child or not
|
||||
int parent_;
|
||||
// pointer to left, right
|
||||
int cleft_, cright_;
|
||||
// split feature index, left split or right split depends on the highest bit
|
||||
unsigned sindex_;
|
||||
// extra info
|
||||
Info info_;
|
||||
private:
|
||||
inline void set_parent( int pidx, bool is_left_child = true ){
|
||||
if( is_left_child ) pidx |= (1U << 31);
|
||||
this->parent_ = pidx;
|
||||
}
|
||||
public:
|
||||
/*! \brief index of left child */
|
||||
inline int cleft( void ) const{
|
||||
return this->cleft_;
|
||||
}
|
||||
/*! \brief index of right child */
|
||||
inline int cright( void ) const{
|
||||
return this->cright_;
|
||||
}
|
||||
/*! \brief index of default child when feature is missing */
|
||||
inline int cdefault( void ) const{
|
||||
return this->default_left() ? this->cleft() : this->cright();
|
||||
}
|
||||
/*! \brief feature index of split condition */
|
||||
inline unsigned split_index( void ) const{
|
||||
return sindex_ & ( (1U<<31) - 1U );
|
||||
}
|
||||
/*! \brief when feature is unknown, whether goes to left child */
|
||||
inline bool default_left( void ) const{
|
||||
return (sindex_ >> 31) != 0;
|
||||
}
|
||||
/*! \brief whether current node is leaf node */
|
||||
inline bool is_leaf( void ) const{
|
||||
return cleft_ == -1;
|
||||
}
|
||||
/*! \brief get leaf value of leaf node */
|
||||
inline float leaf_value( void ) const{
|
||||
return (this->info_).leaf_value;
|
||||
}
|
||||
/*! \brief get split condition of the node */
|
||||
inline TSplitCond split_cond( void ) const{
|
||||
return (this->info_).split_cond;
|
||||
}
|
||||
/*! \brief get parent of the node */
|
||||
inline int parent( void ) const{
|
||||
return parent_ & ( (1U << 31) - 1 );
|
||||
}
|
||||
/*! \brief whether current node is left child */
|
||||
inline bool is_left_child( void ) const{
|
||||
return ( parent_ & (1U << 31)) != 0;
|
||||
}
|
||||
/*! \brief whether current node is root */
|
||||
inline bool is_root( void ) const{
|
||||
return parent_ == -1;
|
||||
}
|
||||
/*!
|
||||
* \brief set the right child
|
||||
* \param nide node id to right child
|
||||
*/
|
||||
inline void set_right_child( int nid ){
|
||||
this->cright_ = nid;
|
||||
}
|
||||
/*!
|
||||
* \brief set split condition of current node
|
||||
* \param split_index feature index to split
|
||||
* \param split_cond split condition
|
||||
* \param default_left the default direction when feature is unknown
|
||||
*/
|
||||
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
|
||||
if( default_left ) split_index |= (1U << 31);
|
||||
this->sindex_ = split_index;
|
||||
(this->info_).split_cond = split_cond;
|
||||
}
|
||||
/*!
|
||||
* \brief set the leaf value of the node
|
||||
* \param value leaf value
|
||||
* \param right right index, could be used to store
|
||||
* additional information
|
||||
*/
|
||||
inline void set_leaf( float value, int right = -1 ){
|
||||
(this->info_).leaf_value = value;
|
||||
this->cleft_ = -1;
|
||||
this->cright_ = right;
|
||||
}
|
||||
};
|
||||
protected:
|
||||
// vector of nodes
|
||||
std::vector<Node> nodes;
|
||||
// stats of nodes
|
||||
std::vector<TNodeStat> stats;
|
||||
protected:
|
||||
// free node space, used during training process
|
||||
std::vector<int> deleted_nodes;
|
||||
// allocate a new node,
|
||||
// !!!!!! NOTE: may cause BUG here, nodes.resize
|
||||
inline int AllocNode( void ){
|
||||
if( param.num_deleted != 0 ){
|
||||
int nd = deleted_nodes.back();
|
||||
deleted_nodes.pop_back();
|
||||
param.num_deleted --;
|
||||
return nd;
|
||||
}
|
||||
int nd = param.num_nodes ++;
|
||||
nodes.resize( param.num_nodes );
|
||||
stats.resize( param.num_nodes );
|
||||
return nd;
|
||||
}
|
||||
// delete a tree node
|
||||
inline void DeleteNode( int nid ){
|
||||
utils::Assert( nid >= param.num_roots, "can not delete root");
|
||||
deleted_nodes.push_back( nid );
|
||||
nodes[ nid ].set_parent( -1 );
|
||||
param.num_deleted ++;
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief change a non leaf node to a leaf node, delete its children
|
||||
* \param rid node id of the node
|
||||
* \param new leaf value
|
||||
*/
|
||||
inline void ChangeToLeaf( int rid, float value ){
|
||||
utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child");
|
||||
utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child");
|
||||
this->DeleteNode( nodes[ rid ].cleft() );
|
||||
this->DeleteNode( nodes[ rid ].cright() );
|
||||
nodes[ rid ].set_leaf( value );
|
||||
}
|
||||
/*!
|
||||
* \brief collapse a non leaf node to a leaf node, delete its children
|
||||
* \param rid node id of the node
|
||||
* \param new leaf value
|
||||
*/
|
||||
inline void CollapseToLeaf( int rid, float value ){
|
||||
if( nodes[rid].is_leaf() ) return;
|
||||
if( !nodes[ nodes[rid].cleft() ].is_leaf() ){
|
||||
CollapseToLeaf( nodes[rid].cleft(), 0.0f );
|
||||
}
|
||||
if( !nodes[ nodes[rid].cright() ].is_leaf() ){
|
||||
CollapseToLeaf( nodes[rid].cright(), 0.0f );
|
||||
}
|
||||
this->ChangeToLeaf( rid, value );
|
||||
}
|
||||
public:
|
||||
/*! \brief model parameter */
|
||||
Param param;
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
TreeModel( void ){
|
||||
param.num_nodes = 1;
|
||||
param.num_roots = 1;
|
||||
param.num_deleted = 0;
|
||||
nodes.resize( 1 );
|
||||
}
|
||||
/*! \brief get node given nid */
|
||||
inline Node &operator[]( int nid ){
|
||||
return nodes[ nid ];
|
||||
}
|
||||
/*! \brief get node statistics given nid */
|
||||
inline NodeStat &stat( int nid ){
|
||||
return stats[ nid ];
|
||||
}
|
||||
/*! \brief initialize the model */
|
||||
inline void InitModel( void ){
|
||||
param.num_nodes = param.num_roots;
|
||||
nodes.resize( param.num_nodes );
|
||||
stats.resize( param.num_nodes );
|
||||
for( int i = 0; i < param.num_nodes; i ++ ){
|
||||
nodes[i].set_leaf( 0.0f );
|
||||
nodes[i].set_parent( -1 );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
utils::Assert( fi.Read( ¶m, sizeof(Param) ) > 0, "TreeModel" );
|
||||
nodes.resize( param.num_nodes ); stats.resize( param.num_nodes );
|
||||
utils::Assert( fi.Read( &nodes[0], sizeof(Node) * nodes.size() ) > 0, "TreeModel::Node" );
|
||||
utils::Assert( fi.Read( &stats[0], sizeof(NodeStat) * stats.size() ) > 0, "TreeModel::Node" );
|
||||
|
||||
deleted_nodes.resize( 0 );
|
||||
for( int i = param.num_roots; i < param.num_nodes; i ++ ){
|
||||
if( nodes[i].is_root() ) deleted_nodes.push_back( i );
|
||||
}
|
||||
utils::Assert( (int)deleted_nodes.size() == param.num_deleted, "number of deleted nodes do not match" );
|
||||
}
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel( utils::IStream &fo ) const{
|
||||
utils::Assert( param.num_nodes == (int)nodes.size() );
|
||||
utils::Assert( param.num_nodes == (int)stats.size() );
|
||||
fo.Write( ¶m, sizeof(Param) );
|
||||
fo.Write( &nodes[0], sizeof(Node) * nodes.size() );
|
||||
fo.Write( &stats[0], sizeof(NodeStat) * nodes.size() );
|
||||
}
|
||||
/*!
|
||||
* \brief add child nodes to node
|
||||
* \param nid node id to add childs
|
||||
*/
|
||||
inline void AddChilds( int nid ){
|
||||
int pleft = this->AllocNode();
|
||||
int pright = this->AllocNode();
|
||||
nodes[ nid ].cleft_ = pleft;
|
||||
nodes[ nid ].cright_ = pright;
|
||||
nodes[ nodes[ nid ].cleft() ].set_parent( nid, true );
|
||||
nodes[ nodes[ nid ].cright() ].set_parent( nid, false );
|
||||
}
|
||||
/*!
|
||||
* \brief only add a right child to a leaf node
|
||||
* \param node id to add right child
|
||||
*/
|
||||
inline void AddRightChild( int nid ){
|
||||
int pright = this->AllocNode();
|
||||
nodes[ nid ].right = pright;
|
||||
nodes[ nodes[ nid ].right ].set_parent( nid, false );
|
||||
}
|
||||
/*!
|
||||
* \brief get current depth
|
||||
* \param nid node id
|
||||
* \param pass_rchild whether right child is not counted in depth
|
||||
*/
|
||||
inline int GetDepth( int nid, bool pass_rchild = false ) const{
|
||||
int depth = 0;
|
||||
while( !nodes[ nid ].is_root() ){
|
||||
if( !pass_rchild || nodes[ nid ].is_left_child() ) depth ++;
|
||||
nid = nodes[ nid ].parent();
|
||||
}
|
||||
return depth;
|
||||
}
|
||||
/*!
|
||||
* \brief get maximum depth
|
||||
* \param nid node id
|
||||
*/
|
||||
inline int MaxDepth( int nid ) const{
|
||||
if( nodes[nid].is_leaf() ) return 0;
|
||||
return std::max( MaxDepth( nodes[nid].cleft() )+1,
|
||||
MaxDepth( nodes[nid].cright() )+1 );
|
||||
}
|
||||
/*!
|
||||
* \brief get maximum depth
|
||||
*/
|
||||
inline int MaxDepth( void ){
|
||||
int maxd = 0;
|
||||
for( int i = 0; i < param.num_roots; ++ i ){
|
||||
maxd = std::max( maxd, MaxDepth( i ) );
|
||||
}
|
||||
return maxd;
|
||||
}
|
||||
/*! \brief number of extra nodes besides the root */
|
||||
inline int num_extra_nodes( void ) const {
|
||||
return param.num_nodes - param.num_roots - param.num_deleted;
|
||||
}
|
||||
/*! \brief dump model to text file */
|
||||
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
|
||||
this->Dump( 0, fo, fmap, 0, with_stats );
|
||||
}
|
||||
private:
|
||||
void Dump( int nid, FILE *fo, const utils::FeatMap& fmap, int depth, bool with_stats ){
|
||||
for( int i = 0; i < depth; ++ i ){
|
||||
fprintf( fo, "\t" );
|
||||
}
|
||||
if( nodes[ nid ].is_leaf() ){
|
||||
fprintf( fo, "%d:leaf=%f ", nid, nodes[ nid ].leaf_value() );
|
||||
if( with_stats ){
|
||||
stat( nid ).Print( fo, true );
|
||||
}
|
||||
fprintf( fo, "\n" );
|
||||
}else{
|
||||
// right then left,
|
||||
TSplitCond cond = nodes[ nid ].split_cond();
|
||||
const unsigned split_index = nodes[ nid ].split_index();
|
||||
|
||||
if( split_index < fmap.size() ){
|
||||
switch( fmap.type(split_index) ){
|
||||
case utils::FeatMap::kIndicator:{
|
||||
int nyes = nodes[ nid ].default_left()?nodes[nid].cright():nodes[nid].cleft();
|
||||
fprintf( fo, "%d:[%s] yes=%d,no=%d",
|
||||
nid, fmap.name( split_index ),
|
||||
nyes, nodes[nid].cdefault() );
|
||||
break;
|
||||
}
|
||||
case utils::FeatMap::kInteger:{
|
||||
fprintf( fo, "%d:[%s<%d] yes=%d,no=%d,missing=%d",
|
||||
nid, fmap.name(split_index), int( float(cond)+1.0f),
|
||||
nodes[ nid ].cleft(), nodes[ nid ].cright(),
|
||||
nodes[ nid ].cdefault() );
|
||||
break;
|
||||
}
|
||||
case utils::FeatMap::kFloat:
|
||||
case utils::FeatMap::kQuantitive:{
|
||||
fprintf( fo, "%d:[%s<%f] yes=%d,no=%d,missing=%d",
|
||||
nid, fmap.name(split_index), float(cond),
|
||||
nodes[ nid ].cleft(), nodes[ nid ].cright(),
|
||||
nodes[ nid ].cdefault() );
|
||||
break;
|
||||
}
|
||||
default: utils::Error("unknown fmap type");
|
||||
}
|
||||
}else{
|
||||
fprintf( fo, "%d:[f%u<%f] yes=%d,no=%d,missing=%d",
|
||||
nid, split_index, float(cond),
|
||||
nodes[ nid ].cleft(), nodes[ nid ].cright(),
|
||||
nodes[ nid ].cdefault() );
|
||||
}
|
||||
if( with_stats ){
|
||||
fprintf( fo, " ");
|
||||
stat( nid ).Print( fo, false );
|
||||
}
|
||||
fprintf( fo, "\n" );
|
||||
this->Dump( nodes[ nid ].cleft(), fo, fmap, depth+1, with_stats );
|
||||
this->Dump( nodes[ nid ].cright(), fo, fmap, depth+1, with_stats );
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace booster{
|
||||
/*! \brief training parameters for regression tree */
|
||||
struct TreeParamTrain{
|
||||
// learning step size for a time
|
||||
float learning_rate;
|
||||
// minimum loss change required for a split
|
||||
float min_split_loss;
|
||||
// maximum depth of a tree
|
||||
int max_depth;
|
||||
//----- the rest parameters are less important ----
|
||||
// minimum amount of hessian(weight) allowed in a child
|
||||
float min_child_weight;
|
||||
// weight decay parameter used to control leaf fitting
|
||||
float reg_lambda;
|
||||
// reg method
|
||||
int reg_method;
|
||||
// default direction choice
|
||||
int default_direction;
|
||||
// whether we want to do subsample
|
||||
float subsample;
|
||||
// whether to use layerwise aware regularization
|
||||
int use_layerwise;
|
||||
// number of threads to be used for tree construction, if OpenMP is enabled, if equals 0, use system default
|
||||
int nthread;
|
||||
/*! \brief constructor */
|
||||
TreeParamTrain( void ){
|
||||
learning_rate = 0.3f;
|
||||
min_child_weight = 1.0f;
|
||||
max_depth = 6;
|
||||
reg_lambda = 1.0f;
|
||||
reg_method = 2;
|
||||
default_direction = 0;
|
||||
subsample = 1.0f;
|
||||
use_layerwise = 0;
|
||||
nthread = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
// sync-names
|
||||
if( !strcmp( name, "gamma") ) min_split_loss = (float)atof( val );
|
||||
if( !strcmp( name, "eta") ) learning_rate = (float)atof( val );
|
||||
if( !strcmp( name, "lambda") ) reg_lambda = (float)atof( val );
|
||||
// normal tree prameters
|
||||
if( !strcmp( name, "learning_rate") ) learning_rate = (float)atof( val );
|
||||
if( !strcmp( name, "min_child_weight") ) min_child_weight = (float)atof( val );
|
||||
if( !strcmp( name, "min_split_loss") ) min_split_loss = (float)atof( val );
|
||||
if( !strcmp( name, "max_depth") ) max_depth = atoi( val );
|
||||
if( !strcmp( name, "reg_lambda") ) reg_lambda = (float)atof( val );
|
||||
if( !strcmp( name, "reg_method") ) reg_method = (float)atof( val );
|
||||
if( !strcmp( name, "subsample") ) subsample = (float)atof( val );
|
||||
if( !strcmp( name, "use_layerwise") ) use_layerwise = atoi( val );
|
||||
if( !strcmp( name, "nthread") ) nthread = atoi( val );
|
||||
if( !strcmp( name, "default_direction") ) {
|
||||
if( !strcmp( val, "learn") ) default_direction = 0;
|
||||
if( !strcmp( val, "left") ) default_direction = 1;
|
||||
if( !strcmp( val, "right") ) default_direction = 2;
|
||||
}
|
||||
}
|
||||
protected:
|
||||
// functions for L1 cost
|
||||
static inline double ThresholdL1( double w, double lambda ){
|
||||
if( w > +lambda ) return w - lambda;
|
||||
if( w < -lambda ) return w + lambda;
|
||||
return 0.0;
|
||||
}
|
||||
inline double CalcWeight( double sum_grad, double sum_hess )const{
|
||||
if( sum_hess < min_child_weight ){
|
||||
return 0.0;
|
||||
}else{
|
||||
switch( reg_method ){
|
||||
case 1: return - ThresholdL1( sum_grad, reg_lambda ) / sum_hess;
|
||||
case 2: return - sum_grad / ( sum_hess + reg_lambda );
|
||||
// elstic net
|
||||
case 3: return - ThresholdL1( sum_grad, 0.5 * reg_lambda ) / ( sum_hess + 0.5 * reg_lambda );
|
||||
default: return - sum_grad / sum_hess;
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
inline static double Sqr( double a ){
|
||||
return a * a;
|
||||
}
|
||||
public:
|
||||
// calculate the cost of loss function
|
||||
inline double CalcGain( double sum_grad, double sum_hess ) const{
|
||||
if( sum_hess < min_child_weight ){
|
||||
return 0.0;
|
||||
}
|
||||
switch( reg_method ){
|
||||
case 1 : return Sqr( ThresholdL1( sum_grad, reg_lambda ) ) / sum_hess;
|
||||
case 2 : return Sqr( sum_grad ) / ( sum_hess + reg_lambda );
|
||||
// elstic net
|
||||
case 3 : return Sqr( ThresholdL1( sum_grad, 0.5 * reg_lambda ) ) / ( sum_hess + 0.5 * reg_lambda );
|
||||
default: return Sqr( sum_grad ) / sum_hess;
|
||||
}
|
||||
}
|
||||
// KEY:layerwise
|
||||
// calculate cost of root
|
||||
inline double CalcRootGain( double sum_grad, double sum_hess ) const{
|
||||
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
|
||||
else return 0.0;
|
||||
}
|
||||
// KEY:layerwise
|
||||
// calculate the cost after split
|
||||
// base_weight: the base_weight of parent
|
||||
inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
|
||||
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
|
||||
else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
|
||||
}
|
||||
// calculate the weight of leaf
|
||||
inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
|
||||
if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
|
||||
else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
|
||||
}
|
||||
/*! \brief whether need forward small to big search: default right */
|
||||
inline bool need_forward_search( void ) const{
|
||||
return this->default_direction != 1;
|
||||
}
|
||||
/*! \brief whether need forward big to small search: default left */
|
||||
inline bool need_backward_search( void ) const{
|
||||
return this->default_direction != 2;
|
||||
}
|
||||
/*! \brief given the loss change, whether we need to invode prunning */
|
||||
inline bool need_prune( double loss_chg, int depth ) const{
|
||||
return loss_chg < this->min_split_loss;
|
||||
}
|
||||
/*! \brief whether we can split with current hessian */
|
||||
inline bool cannot_split( double sum_hess, int depth ) const{
|
||||
return sum_hess < this->min_child_weight * 2.0;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace booster{
|
||||
/*! \brief node statistics used in regression tree */
|
||||
struct RTreeNodeStat{
|
||||
/*! \brief loss chg caused by current split */
|
||||
float loss_chg;
|
||||
/*! \brief sum of hessian values, used to measure coverage of data */
|
||||
float sum_hess;
|
||||
/*! \brief weight of current node */
|
||||
float base_weight;
|
||||
/*! \brief number of child that is leaf node known up to now */
|
||||
int leaf_child_cnt;
|
||||
/*! \brief print information of current stats to fo */
|
||||
inline void Print( FILE *fo, bool is_leaf ) const{
|
||||
if( !is_leaf ){
|
||||
fprintf( fo, "gain=%f,cover=%f", loss_chg, sum_hess );
|
||||
}else{
|
||||
fprintf( fo, "cover=%f", sum_hess );
|
||||
}
|
||||
}
|
||||
};
|
||||
/*! \brief most comment structure of regression tree */
|
||||
class RegTree: public TreeModel<bst_float,RTreeNodeStat>{
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
39
booster/xgboost-inl.hpp
Normal file
39
booster/xgboost-inl.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
#ifndef XGBOOST_INL_HPP
|
||||
#define XGBOOST_INL_HPP
|
||||
/*!
|
||||
* \file xgboost-inl.hpp
|
||||
* \brief bootser implementations
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
// implementation of boosters go to here
|
||||
|
||||
// A good design should have minimum functions defined interface, user should only operate on interface
|
||||
// I break it a bit, by using template and let user 'see' the implementation
|
||||
// The user should pretend that they only can use the interface, and we are all cool
|
||||
// I find this is the only way so far I can think of to make boosters invariant of data structure,
|
||||
// while keep everything fast
|
||||
#include "xgboost.h"
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "tree/xgboost_tree.hpp"
|
||||
#include "linear/xgboost_linear.hpp"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief create a gradient booster, given type of booster
|
||||
* \param booster_type type of gradient booster, can be used to specify implements
|
||||
* \tparam FMatrix input data type for booster
|
||||
* \return the pointer to the gradient booster created
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){
|
||||
switch( booster_type ){
|
||||
case 0: return new RegTreeTrainer<FMatrix>();
|
||||
case 1: return new LinearBooster<FMatrix>();
|
||||
default: utils::Error("unknown booster_type"); return NULL;
|
||||
}
|
||||
}
|
||||
}; // namespace booster
|
||||
}; // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_INL_HPP
|
||||
157
booster/xgboost.h
Normal file
157
booster/xgboost.h
Normal file
@@ -0,0 +1,157 @@
|
||||
#ifndef XGBOOST_H
|
||||
#define XGBOOST_H
|
||||
/*!
|
||||
* \file xgboost.h
|
||||
* \brief the general gradient boosting interface
|
||||
*
|
||||
* common practice of this header: use IBooster and CreateBooster<FMatrixS>
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <vector>
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "../utils/xgboost_fmap.h"
|
||||
#include "../utils/xgboost_stream.h"
|
||||
#include "../utils/xgboost_config.h"
|
||||
#include "xgboost_data.h"
|
||||
|
||||
/*! \brief namespace for xboost package */
|
||||
namespace xgboost{
|
||||
/*! \brief namespace for boosters */
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief interface of a gradient boosting learner
|
||||
* \tparam FMatrix the feature matrix format that the booster takes
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class InterfaceBooster{
|
||||
public:
|
||||
// interface for model setting and loading
|
||||
// calling procedure:
|
||||
// (1) booster->SetParam to setting necessary parameters
|
||||
// (2) if it is first time usage of the model:
|
||||
// call booster->InitModel
|
||||
// else:
|
||||
// call booster->LoadModel
|
||||
// (3) booster->DoBoost to update the model
|
||||
// (4) booster->Predict to get new prediction
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam( const char *name, const char *val ) = 0;
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
virtual void LoadModel( utils::IStream &fi ) = 0;
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void SaveModel( utils::IStream &fo ) const = 0;
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
virtual void InitModel( void ) = 0;
|
||||
public:
|
||||
/*!
|
||||
* \brief do gradient boost training for one step, using the information given,
|
||||
* Note: content of grad and hess can change after DoBoost
|
||||
* \param grad first order gradient of each instance
|
||||
* \param hess second order gradient of each instance
|
||||
* \param feats features of each instance
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
virtual void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const FMatrix &feats,
|
||||
const std::vector<unsigned> &root_index ) = 0;
|
||||
/*!
|
||||
* \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree
|
||||
* \param path the result of path
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param root_index root id of current instance, default = 0
|
||||
*/
|
||||
virtual void PredPath( std::vector<int> &path, const FMatrix &feats,
|
||||
bst_uint row_index, unsigned root_index = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
}
|
||||
/*!
|
||||
* \brief predict values for given sparse feature vector
|
||||
*
|
||||
* NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general,
|
||||
* dense version of Predict to ensures threadsafety
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param root_index root id of current instance, default = 0
|
||||
* \return prediction
|
||||
*/
|
||||
virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
return 0.0f;
|
||||
}
|
||||
/*!
|
||||
* \brief predict values for given dense feature vector
|
||||
* \param feat feature vector in dense format
|
||||
* \param funknown indicator that the feature is missing
|
||||
* \param rid root id of current instance, default = 0
|
||||
* \return prediction
|
||||
*/
|
||||
virtual float Predict( const std::vector<float> &feat,
|
||||
const std::vector<bool> &funknown,
|
||||
unsigned rid = 0 ){
|
||||
utils::Error( "not implemented" );
|
||||
return 0.0f;
|
||||
}
|
||||
/*!
|
||||
* \brief print information
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void PrintInfo( FILE *fo ){}
|
||||
/*!
|
||||
* \brief dump model into text file
|
||||
* \param fo output stream
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics
|
||||
*/
|
||||
virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){
|
||||
utils::Error( "not implemented" );
|
||||
}
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~InterfaceBooster( void ){}
|
||||
};
|
||||
};
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief this will is the most commonly used booster interface
|
||||
* we try to make booster invariant of data structures, but most cases, FMatrixS is what we wnat
|
||||
*/
|
||||
typedef InterfaceBooster<FMatrixS> IBooster;
|
||||
};
|
||||
};
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief create a gradient booster, given type of booster
|
||||
* normally we use FMatrixS, by calling CreateBooster<FMatrixS>
|
||||
* \param booster_type type of gradient booster, can be used to specify implements
|
||||
* \tparam FMatrix input data type for booster
|
||||
* \return the pointer to the gradient booster created
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type );
|
||||
};
|
||||
};
|
||||
|
||||
// this file includes the template implementations of all boosters
|
||||
// the cost of using template is that the user can 'see' all the implementations, which is OK
|
||||
// ignore implementations and focus on the interface:)
|
||||
#include "xgboost-inl.hpp"
|
||||
#endif
|
||||
393
booster/xgboost_data.h
Normal file
393
booster/xgboost_data.h
Normal file
@@ -0,0 +1,393 @@
|
||||
#ifndef XGBOOST_DATA_H
|
||||
#define XGBOOST_DATA_H
|
||||
|
||||
/*!
|
||||
* \file xgboost_data.h
|
||||
* \brief the input data structure for gradient boosting
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <climits>
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "../utils/xgboost_stream.h"
|
||||
#include "../utils/xgboost_matrix_csr.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*! \brief interger type used in boost */
|
||||
typedef int bst_int;
|
||||
/*! \brief unsigned interger type used in boost */
|
||||
typedef unsigned bst_uint;
|
||||
/*! \brief float type used in boost */
|
||||
typedef float bst_float;
|
||||
/*! \brief debug option for booster */
|
||||
const bool bst_debug = false;
|
||||
};
|
||||
};
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/**
|
||||
* \brief This is a interface, defining the way to access features,
|
||||
* by column or by row. This interface is used to make implementation
|
||||
* of booster does not depend on how feature is stored.
|
||||
*
|
||||
* Why template instead of virtual class: for efficiency
|
||||
* feature matrix is going to be used by most inner loop of the algorithm
|
||||
*
|
||||
* \tparam Derived type of actual implementation
|
||||
* \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing
|
||||
*/
|
||||
template<typename Derived>
|
||||
struct FMatrix{
|
||||
public:
|
||||
/*! \brief exmaple iterator over one row */
|
||||
struct RowIter{
|
||||
/*!
|
||||
* \brief move to next position
|
||||
* \return whether there is element in next position
|
||||
*/
|
||||
|
||||
inline bool Next( void );
|
||||
/*! \return feature index in current position */
|
||||
inline bst_uint findex( void ) const;
|
||||
/*! \return feature value in current position */
|
||||
inline bst_float fvalue( void ) const;
|
||||
};
|
||||
/*! \brief example iterator over one column */
|
||||
struct ColIter{
|
||||
/*!
|
||||
* \brief move to next position
|
||||
* \return whether there is element in next position
|
||||
*/
|
||||
inline bool Next( void );
|
||||
/*! \return row index of current position */
|
||||
inline bst_uint rindex( void ) const;
|
||||
/*! \return feature value in current position */
|
||||
inline bst_float fvalue( void ) const;
|
||||
};
|
||||
/*! \brief backward iterator over column */
|
||||
struct ColBackIter : public ColIter {};
|
||||
public:
|
||||
/*!
|
||||
* \brief get number of rows
|
||||
* \return number of rows
|
||||
*/
|
||||
inline size_t NumRow( void ) const;
|
||||
/*!
|
||||
* \brief get number of columns
|
||||
* \return number of columns
|
||||
*/
|
||||
inline size_t NumCol( void ) const;
|
||||
/*!
|
||||
* \brief get row iterator
|
||||
* \param ridx row index
|
||||
* \return row iterator
|
||||
*/
|
||||
inline RowIter GetRow( size_t ridx ) const;
|
||||
/*!
|
||||
* \brief get number of column groups, this ise used together with GetRow( ridx, gid )
|
||||
* \return number of column group
|
||||
*/
|
||||
inline unsigned NumColGroup( void ) const{
|
||||
return 1;
|
||||
}
|
||||
/*!
|
||||
* \brief get row iterator, return iterator of specific column group
|
||||
* \param ridx row index
|
||||
* \param gid colmun group id
|
||||
* \return row iterator, only iterates over features of specified column group
|
||||
*/
|
||||
inline RowIter GetRow( size_t ridx, unsigned gid ) const;
|
||||
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess( void ) const;
|
||||
/*!
|
||||
* \brief get column iterator, the columns must be sorted by feature value
|
||||
* \param ridx column index
|
||||
* \return column iterator
|
||||
*/
|
||||
inline ColIter GetSortedCol( size_t ridx ) const;
|
||||
/*!
|
||||
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
|
||||
* \param ridx column index
|
||||
* \return reverse column iterator
|
||||
*/
|
||||
inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief feature matrix to store training instance, in sparse CSR format
|
||||
*/
|
||||
class FMatrixS: public FMatrix<FMatrixS>{
|
||||
public:
|
||||
/*! \brief one entry in a row */
|
||||
struct REntry{
|
||||
/*! \brief feature index */
|
||||
bst_uint findex;
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
/*! \brief constructor */
|
||||
REntry( void ){}
|
||||
/*! \brief constructor */
|
||||
REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){}
|
||||
inline static bool cmp_fvalue( const REntry &a, const REntry &b ){
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
};
|
||||
/*! \brief one row of sparse feature matrix */
|
||||
struct Line{
|
||||
/*! \brief array of feature index */
|
||||
const REntry *data_;
|
||||
/*! \brief size of the data */
|
||||
bst_uint len;
|
||||
/*! \brief get k-th element */
|
||||
inline const REntry& operator[]( unsigned i ) const{
|
||||
return data_[i];
|
||||
}
|
||||
};
|
||||
/*! \brief row iterator */
|
||||
struct RowIter{
|
||||
const REntry *dptr_, *end_;
|
||||
RowIter( const REntry* dptr, const REntry* end )
|
||||
:dptr_(dptr),end_(end){}
|
||||
inline bool Next( void ){
|
||||
if( dptr_ == end_ ) return false;
|
||||
else{
|
||||
++ dptr_; return true;
|
||||
}
|
||||
}
|
||||
inline bst_uint findex( void ) const{
|
||||
return dptr_->findex;
|
||||
}
|
||||
inline bst_float fvalue( void ) const{
|
||||
return dptr_->fvalue;
|
||||
}
|
||||
};
|
||||
/*! \brief column iterator */
|
||||
struct ColIter: public RowIter{
|
||||
ColIter( const REntry* dptr, const REntry* end )
|
||||
:RowIter( dptr, end ){}
|
||||
inline bst_uint rindex( void ) const{
|
||||
return this->findex();
|
||||
}
|
||||
};
|
||||
/*! \brief reverse column iterator */
|
||||
struct ColBackIter: public ColIter{
|
||||
ColBackIter( const REntry* dptr, const REntry* end )
|
||||
:ColIter( dptr, end ){}
|
||||
// shadows RowIter::Next
|
||||
inline bool Next( void ){
|
||||
if( dptr_ == end_ ) return false;
|
||||
else{
|
||||
-- dptr_; return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
FMatrixS( void ){ this->Clear(); }
|
||||
/*! \brief get number of rows */
|
||||
inline size_t NumRow( void ) const{
|
||||
return row_ptr_.size() - 1;
|
||||
}
|
||||
/*!
|
||||
* \brief get number of nonzero entries
|
||||
* \return number of nonzero entries
|
||||
*/
|
||||
inline size_t NumEntry( void ) const{
|
||||
return row_data_.size();
|
||||
}
|
||||
/*! \brief clear the storage */
|
||||
inline void Clear( void ){
|
||||
row_ptr_.clear();
|
||||
row_ptr_.push_back( 0 );
|
||||
row_data_.clear();
|
||||
col_ptr_.clear();
|
||||
col_data_.clear();
|
||||
}
|
||||
/*! \brief get sparse part of current row */
|
||||
inline Line operator[]( size_t sidx ) const{
|
||||
Line sp;
|
||||
utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" );
|
||||
sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] );
|
||||
sp.data_ = &row_data_[ row_ptr_[ sidx ] ];
|
||||
return sp;
|
||||
}
|
||||
/*!
|
||||
* \brief add a row to the matrix, with data stored in STL container
|
||||
* \param findex feature index
|
||||
* \param fvalue feature value
|
||||
* \param fstart start bound of feature
|
||||
* \param fend end bound range of feature
|
||||
* \return the row id added line
|
||||
*/
|
||||
inline size_t AddRow( const std::vector<bst_uint> &findex,
|
||||
const std::vector<bst_float> &fvalue,
|
||||
unsigned fstart = 0, unsigned fend = UINT_MAX ){
|
||||
utils::Assert( findex.size() == fvalue.size() );
|
||||
unsigned cnt = 0;
|
||||
for( size_t i = 0; i < findex.size(); i ++ ){
|
||||
if( findex[i] < fstart || findex[i] >= fend ) continue;
|
||||
row_data_.push_back( REntry( findex[i], fvalue[i] ) );
|
||||
cnt ++;
|
||||
}
|
||||
row_ptr_.push_back( row_ptr_.back() + cnt );
|
||||
return row_ptr_.size() - 2;
|
||||
}
|
||||
/*! \brief get row iterator*/
|
||||
inline RowIter GetRow( size_t ridx ) const{
|
||||
utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
|
||||
return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 );
|
||||
}
|
||||
/*! \brief get row iterator*/
|
||||
inline RowIter GetRow( size_t ridx, unsigned gid ) const{
|
||||
utils::Assert( gid == 0, "FMatrixS only have 1 column group" );
|
||||
return FMatrixS::GetRow( ridx );
|
||||
}
|
||||
public:
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess( void ) const{
|
||||
return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
|
||||
}
|
||||
/*! \brief get number of colmuns */
|
||||
inline size_t NumCol( void ) const{
|
||||
utils::Assert( this->HaveColAccess() );
|
||||
return col_ptr_.size() - 1;
|
||||
}
|
||||
/*! \brief get col iterator*/
|
||||
inline ColIter GetSortedCol( size_t cidx ) const{
|
||||
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
|
||||
return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 );
|
||||
}
|
||||
/*! \brief get col iterator */
|
||||
inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
|
||||
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
|
||||
return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] );
|
||||
}
|
||||
/*!
|
||||
* \brief intialize the data so that we have both column and row major
|
||||
* access, call this whenever we need column access
|
||||
*/
|
||||
inline void InitData( void ){
|
||||
utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ );
|
||||
builder.InitBudget( 0 );
|
||||
for( size_t i = 0; i < this->NumRow(); i ++ ){
|
||||
for( RowIter it = this->GetRow(i); it.Next(); ){
|
||||
builder.AddBudget( it.findex() );
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
for( size_t i = 0; i < this->NumRow(); i ++ ){
|
||||
for( RowIter it = this->GetRow(i); it.Next(); ){
|
||||
builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) );
|
||||
}
|
||||
}
|
||||
// sort columns
|
||||
unsigned ncol = static_cast<unsigned>( this->NumCol() );
|
||||
for( unsigned i = 0; i < ncol; i ++ ){
|
||||
std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief save data to binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* the function is not consistent between 64bit and 32bit machine
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveBinary( utils::IStream &fo ) const{
|
||||
FMatrixS::SaveBinary( fo, row_ptr_, row_data_ );
|
||||
int col_access = this->HaveColAccess() ? 1 : 0;
|
||||
fo.Write( &col_access, sizeof(int) );
|
||||
if( col_access != 0 ){
|
||||
FMatrixS::SaveBinary( fo, col_ptr_, col_data_ );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load data from binary stream
|
||||
* note: since we have size_t in ptr,
|
||||
* the function is not consistent between 64bit and 32bit machin
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadBinary( utils::IStream &fi ){
|
||||
FMatrixS::LoadBinary( fi, row_ptr_, row_data_ );
|
||||
int col_access;
|
||||
fi.Read( &col_access, sizeof(int) );
|
||||
if( col_access != 0 ){
|
||||
FMatrixS::LoadBinary( fi, col_ptr_, col_data_ );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \param fi input file pointer
|
||||
*/
|
||||
inline void LoadText( FILE *fi ){
|
||||
this->Clear();
|
||||
int ninst;
|
||||
while( fscanf( fi, "%d", &ninst ) == 1 ){
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
while( ninst -- ){
|
||||
unsigned index; float value;
|
||||
utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" );
|
||||
findex.push_back( index ); fvalue.push_back( value );
|
||||
}
|
||||
this->AddRow( findex, fvalue );
|
||||
}
|
||||
// initialize column support as well
|
||||
this->InitData();
|
||||
}
|
||||
private:
|
||||
/*!
|
||||
* \brief save data to binary stream
|
||||
* \param fo output stream
|
||||
* \param ptr pointer data
|
||||
* \param data data content
|
||||
*/
|
||||
inline static void SaveBinary( utils::IStream &fo,
|
||||
const std::vector<size_t> &ptr,
|
||||
const std::vector<REntry> &data ){
|
||||
size_t nrow = ptr.size() - 1;
|
||||
fo.Write( &nrow, sizeof(size_t) );
|
||||
fo.Write( &ptr[0], ptr.size() * sizeof(size_t) );
|
||||
if( data.size() != 0 ){
|
||||
fo.Write( &data[0] , data.size() * sizeof(REntry) );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load data from binary stream
|
||||
* \param fi input stream
|
||||
* \param ptr pointer data
|
||||
* \param data data content
|
||||
*/
|
||||
inline static void LoadBinary( utils::IStream &fi,
|
||||
std::vector<size_t> &ptr,
|
||||
std::vector<REntry> &data ){
|
||||
size_t nrow;
|
||||
utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" );
|
||||
ptr.resize( nrow + 1 );
|
||||
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
|
||||
|
||||
data.resize( ptr.back() );
|
||||
if( data.size() != 0 ){
|
||||
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
|
||||
}
|
||||
}
|
||||
protected:
|
||||
/*! \brief row pointer of CSR sparse storage */
|
||||
std::vector<size_t> row_ptr_;
|
||||
/*! \brief data in the row */
|
||||
std::vector<REntry> row_data_;
|
||||
/*! \brief column pointer of CSC format */
|
||||
std::vector<size_t> col_ptr_;
|
||||
/*! \brief column datas */
|
||||
std::vector<REntry> col_data_;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
385
booster/xgboost_gbmbase.h
Normal file
385
booster/xgboost_gbmbase.h
Normal file
@@ -0,0 +1,385 @@
|
||||
#ifndef XGBOOST_GBMBASE_H
|
||||
#define XGBOOST_GBMBASE_H
|
||||
|
||||
#include <cstring>
|
||||
#include "xgboost.h"
|
||||
#include "xgboost_data.h"
|
||||
#include "../utils/xgboost_omp.h"
|
||||
#include "../utils/xgboost_config.h"
|
||||
/*!
|
||||
* \file xgboost_gbmbase.h
|
||||
* \brief a base model class,
|
||||
* that assembles the ensembles of booster together and do model update
|
||||
* this class can be used as base code to create booster variants
|
||||
*
|
||||
* The detailed implementation of boosters should start by using the class
|
||||
* provided by this file
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
namespace xgboost{
|
||||
namespace booster{
|
||||
/*!
|
||||
* \brief a base model class,
|
||||
* that assembles the ensembles of booster together and provide single routines to do prediction buffer and update
|
||||
* this class can be used as base code to create booster variants
|
||||
* *
|
||||
* relation to xgboost.h:
|
||||
* (1) xgboost.h provides a interface to a single booster(e.g. a single regression tree )
|
||||
* while GBMBaseModel builds upon IBooster to build a class that
|
||||
* ensembls the boosters together;
|
||||
* (2) GBMBaseModel provides prediction buffering scheme to speedup training;
|
||||
* (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles;
|
||||
*
|
||||
* Usage of this class, the number index gives calling dependencies:
|
||||
* (1) model.SetParam to set the parameters
|
||||
* (2) model.LoadModel to load old models or model.InitModel to create a new model
|
||||
* (3) model.InitTrainer before calling model.Predict and model.DoBoost
|
||||
* (4) model.Predict to get predictions given a instance
|
||||
* (4) model.DoBoost to update the ensembles, add new booster to the model
|
||||
* (4) model.SaveModel to save learned results
|
||||
*
|
||||
* Bufferring: each instance comes with a buffer_index in Predict.
|
||||
* when mparam.num_pbuffer != 0, a unique buffer index can be
|
||||
* assigned to each instance to buffer previous results of boosters,
|
||||
* this helps to speedup training, so consider assign buffer_index
|
||||
* for each training instances, if buffer_index = -1, the code
|
||||
* recalculate things from scratch and will still works correctly
|
||||
*/
|
||||
class GBMBase{
|
||||
public:
|
||||
/*! \brief number of thread used */
|
||||
GBMBase( void ){}
|
||||
/*! \brief destructor */
|
||||
virtual ~GBMBase( void ){
|
||||
this->FreeSpace();
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strncmp( name, "bst:", 4 ) ){
|
||||
cfg.PushBack( name + 4, val );
|
||||
}
|
||||
if( !strcmp( name, "silent") ){
|
||||
cfg.PushBack( name, val );
|
||||
}
|
||||
tparam.SetParam( name, val );
|
||||
if( boosters.size() == 0 ) mparam.SetParam( name, val );
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
if( boosters.size() != 0 ) this->FreeSpace();
|
||||
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
|
||||
boosters.resize( mparam.num_boosters );
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
boosters[ i ] = booster::CreateBooster<FMatrixS>( mparam.booster_type );
|
||||
boosters[ i ]->LoadModel( fi );
|
||||
}
|
||||
{// load info
|
||||
booster_info.resize( mparam.num_boosters );
|
||||
if( mparam.num_boosters != 0 ){
|
||||
utils::Assert( fi.Read( &booster_info[0], sizeof(int)*mparam.num_boosters ) != 0 );
|
||||
}
|
||||
}
|
||||
if( mparam.num_pbuffer != 0 ){
|
||||
pred_buffer.resize ( mparam.num_pbuffer );
|
||||
pred_counter.resize( mparam.num_pbuffer );
|
||||
utils::Assert( fi.Read( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ) != 0 );
|
||||
utils::Assert( fi.Read( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ) != 0 );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel( utils::IStream &fo ) const {
|
||||
utils::Assert( mparam.num_boosters == (int)boosters.size() );
|
||||
fo.Write( &mparam, sizeof(ModelParam) );
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
boosters[ i ]->SaveModel( fo );
|
||||
}
|
||||
if( booster_info.size() != 0 ){
|
||||
fo.Write( &booster_info[0], sizeof(int) * booster_info.size() );
|
||||
}
|
||||
if( mparam.num_pbuffer != 0 ){
|
||||
fo.Write( &pred_buffer[0] , pred_buffer.size()*sizeof(float) );
|
||||
fo.Write( &pred_counter[0], pred_counter.size()*sizeof(unsigned) );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel( void ){
|
||||
pred_buffer.clear(); pred_counter.clear();
|
||||
pred_buffer.resize ( mparam.num_pbuffer, 0.0 );
|
||||
pred_counter.resize( mparam.num_pbuffer, 0 );
|
||||
utils::Assert( mparam.num_boosters == 0 );
|
||||
utils::Assert( boosters.size() == 0 );
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer( void ){
|
||||
if( tparam.nthread != 0 ){
|
||||
omp_set_num_threads( tparam.nthread );
|
||||
}
|
||||
// make sure all the boosters get the latest parameters
|
||||
for( size_t i = 0; i < this->boosters.size(); i ++ ){
|
||||
this->ConfigBooster( this->boosters[i] );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics
|
||||
*/
|
||||
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
fprintf( fo, "booster[%d]\n", (int)i );
|
||||
boosters[i]->DumpModel( fo, fmap, with_stats );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath( FILE *fo, const FMatrixS &data ){
|
||||
for( size_t i = 0; i < data.NumRow(); ++ i ){
|
||||
for( size_t j = 0; j < boosters.size(); ++ j ){
|
||||
if( j != 0 ) fprintf( fo, "\t" );
|
||||
std::vector<int> path;
|
||||
boosters[j]->PredPath( path, data, i );
|
||||
fprintf( fo, "%d", path[0] );
|
||||
for( size_t k = 1; k < path.size(); ++ k ){
|
||||
fprintf( fo, ",%d", path[k] );
|
||||
}
|
||||
}
|
||||
fprintf( fo, "\n" );
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief do gradient boost training for one step, using the information given
|
||||
* Note: content of grad and hess can change after DoBoost
|
||||
* \param grad first order gradient of each instance
|
||||
* \param hess second order gradient of each instance
|
||||
* \param feats features of each instance
|
||||
* \param root_index pre-partitioned root index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
inline void DoBoost( std::vector<float> &grad,
|
||||
std::vector<float> &hess,
|
||||
const booster::FMatrixS &feats,
|
||||
const std::vector<unsigned> &root_index ) {
|
||||
booster::IBooster *bst = this->GetUpdateBooster();
|
||||
bst->DoBoost( grad, hess, feats, root_index );
|
||||
}
|
||||
/*!
|
||||
* \brief predict values for given sparse feature vector
|
||||
* NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe
|
||||
* \param feats feature matrix
|
||||
* \param row_index row index in the feature matrix
|
||||
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
|
||||
* \param root_index root id of current instance, default = 0
|
||||
* \return prediction
|
||||
*/
|
||||
inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
size_t istart = 0;
|
||||
float psum = 0.0f;
|
||||
|
||||
// load buffered results if any
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" );
|
||||
istart = this->pred_counter[ buffer_index ];
|
||||
psum = this->pred_buffer [ buffer_index ];
|
||||
}
|
||||
|
||||
for( size_t i = istart; i < this->boosters.size(); i ++ ){
|
||||
psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
|
||||
}
|
||||
// updated the buffered results
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_counter[ buffer_index ] = static_cast<unsigned>( boosters.size() );
|
||||
this->pred_buffer [ buffer_index ] = psum;
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
public:
|
||||
//--------trial code for interactive update an existing booster------
|
||||
//-------- usually not needed, ignore this region ---------
|
||||
/*!
|
||||
* \brief same as Predict, but removes the prediction of booster to be updated
|
||||
* this function must be called once and only once for every data with pbuffer
|
||||
*/
|
||||
inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
float psum = this->Predict( feats, row_index, buffer_index, root_index );
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
psum -= boosters[ bid ]->Predict( feats, row_index, root_index );
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_buffer[ buffer_index ] = psum;
|
||||
}
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
/*! \brief delete the specified booster */
|
||||
inline void DelteBooster( void ){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < mparam.num_boosters , "must specify booster index for deletion");
|
||||
delete boosters[ bid ];
|
||||
for( int i = bid + 1; i < mparam.num_boosters; ++ i ){
|
||||
boosters[i-1] = boosters[ i ];
|
||||
booster_info[i-1] = booster_info[ i ];
|
||||
}
|
||||
boosters.resize( mparam.num_boosters -= 1 );
|
||||
booster_info.resize( boosters.size() );
|
||||
}
|
||||
/*! \brief update the prediction buffer, after booster have been updated */
|
||||
inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
|
||||
this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index );
|
||||
}
|
||||
}
|
||||
}
|
||||
//-----------non public fields afterwards-------------
|
||||
protected:
|
||||
/*! \brief free space of the model */
|
||||
inline void FreeSpace( void ){
|
||||
for( size_t i = 0; i < boosters.size(); i ++ ){
|
||||
delete boosters[i];
|
||||
}
|
||||
boosters.clear(); booster_info.clear(); mparam.num_boosters = 0;
|
||||
}
|
||||
/*! \brief configure a booster */
|
||||
inline void ConfigBooster( booster::IBooster *bst ){
|
||||
cfg.BeforeFirst();
|
||||
while( cfg.Next() ){
|
||||
bst->SetParam( cfg.name(), cfg.val() );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief get a booster to update
|
||||
* \return the booster created
|
||||
*/
|
||||
inline booster::IBooster *GetUpdateBooster( void ){
|
||||
if( tparam.reupdate_booster != -1 ){
|
||||
const int bid = tparam.reupdate_booster;
|
||||
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
|
||||
this->ConfigBooster( boosters[bid] );
|
||||
return boosters[ bid ];
|
||||
}
|
||||
|
||||
if( mparam.do_reboost == 0 || boosters.size() == 0 ){
|
||||
mparam.num_boosters += 1;
|
||||
boosters.push_back( booster::CreateBooster<FMatrixS>( mparam.booster_type ) );
|
||||
booster_info.push_back( 0 );
|
||||
this->ConfigBooster( boosters.back() );
|
||||
boosters.back()->InitModel();
|
||||
}else{
|
||||
this->ConfigBooster( boosters.back() );
|
||||
}
|
||||
return boosters.back();
|
||||
}
|
||||
protected:
|
||||
/*! \brief model parameters */
|
||||
struct ModelParam{
|
||||
/*! \brief number of boosters */
|
||||
int num_boosters;
|
||||
/*! \brief type of tree used */
|
||||
int booster_type;
|
||||
/*! \brief number of root: default 0, means single tree */
|
||||
int num_roots;
|
||||
/*! \brief number of features to be used by boosters */
|
||||
int num_feature;
|
||||
/*! \brief size of predicton buffer allocated for buffering boosting computation */
|
||||
int num_pbuffer;
|
||||
/*!
|
||||
* \brief whether we repeatly update a single booster each round: default 0
|
||||
* set to 1 for linear booster, so that regularization term can be considered
|
||||
*/
|
||||
int do_reboost;
|
||||
/*! \brief reserved parameters */
|
||||
int reserved[ 32 ];
|
||||
/*! \brief constructor */
|
||||
ModelParam( void ){
|
||||
num_boosters = 0;
|
||||
booster_type = 0;
|
||||
num_roots = num_feature = 0;
|
||||
do_reboost = 0;
|
||||
num_pbuffer = 0;
|
||||
memset( reserved, 0, sizeof( reserved ) );
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("booster_type", name ) ){
|
||||
booster_type = atoi( val );
|
||||
// linear boost automatically set do reboost
|
||||
if( booster_type == 1 ) do_reboost = 1;
|
||||
}
|
||||
if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val );
|
||||
if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val );
|
||||
if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val );
|
||||
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
|
||||
}
|
||||
};
|
||||
/*! \brief training parameters */
|
||||
struct TrainParam{
|
||||
/*! \brief number of OpenMP threads */
|
||||
int nthread;
|
||||
/*!
|
||||
* \brief index of specific booster to be re-updated, default = -1: update new booster
|
||||
* parameter this is part of trial interactive update mode
|
||||
*/
|
||||
int reupdate_booster;
|
||||
/*! \brief constructor */
|
||||
TrainParam( void ) {
|
||||
nthread = 1;
|
||||
reupdate_booster = -1;
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("nthread", name ) ) nthread = atoi( val );
|
||||
if( !strcmp("interact:booster_index", name ) ) reupdate_booster = atoi( val );
|
||||
}
|
||||
};
|
||||
protected:
|
||||
/*! \brief model parameters */
|
||||
ModelParam mparam;
|
||||
/*! \brief training parameters */
|
||||
TrainParam tparam;
|
||||
protected:
|
||||
/*! \brief component boosters */
|
||||
std::vector<booster::IBooster*> boosters;
|
||||
/*! \brief some information indicator of the booster, reserved */
|
||||
std::vector<int> booster_info;
|
||||
/*! \brief prediction buffer */
|
||||
std::vector<float> pred_buffer;
|
||||
/*! \brief prediction buffer counter, record the progress so fart of the buffer */
|
||||
std::vector<unsigned> pred_counter;
|
||||
/*! \brief configurations saved for each booster */
|
||||
utils::ConfigSaver cfg;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
14
demo/binary_classification/README
Normal file
14
demo/binary_classification/README
Normal file
@@ -0,0 +1,14 @@
|
||||
Demonstrating how to use XGBoost accomplish binary classification tasks on UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom
|
||||
|
||||
Run: ./runexp.sh
|
||||
|
||||
Format of input: LIBSVM format
|
||||
|
||||
Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
|
||||
- Feature id must be from 0 to number of features, in sorted order.
|
||||
- i means this feature is binary indicator feature
|
||||
- q means this feature is a quantitative value, such as age, time, can be missing
|
||||
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
|
||||
|
||||
|
||||
Explainations: https://github.com/tqchen/xgboost/wiki/Binary-Classification
|
||||
8124
demo/binary_classification/agaricus-lepiota.data
Normal file
8124
demo/binary_classification/agaricus-lepiota.data
Normal file
File diff suppressed because it is too large
Load Diff
32
demo/binary_classification/agaricus-lepiota.fmap
Normal file
32
demo/binary_classification/agaricus-lepiota.fmap
Normal file
@@ -0,0 +1,32 @@
|
||||
1. cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
|
||||
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
|
||||
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
|
||||
4. bruises?: bruises=t,no=f
|
||||
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
|
||||
musty=m,none=n,pungent=p,spicy=s
|
||||
6. gill-attachment: attached=a,descending=d,free=f,notched=n
|
||||
7. gill-spacing: close=c,crowded=w,distant=d
|
||||
8. gill-size: broad=b,narrow=n
|
||||
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
|
||||
green=r,orange=o,pink=p,purple=u,red=e,
|
||||
white=w,yellow=y
|
||||
10. stalk-shape: enlarging=e,tapering=t
|
||||
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
|
||||
rhizomorphs=z,rooted=r,missing=?
|
||||
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
|
||||
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
|
||||
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
|
||||
pink=p,red=e,white=w,yellow=y
|
||||
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
|
||||
pink=p,red=e,white=w,yellow=y
|
||||
16. veil-type: partial=p,universal=u
|
||||
17. veil-color: brown=n,orange=o,white=w,yellow=y
|
||||
18. ring-number: none=n,one=o,two=t
|
||||
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
|
||||
none=n,pendant=p,sheathing=s,zone=z
|
||||
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
|
||||
orange=o,purple=u,white=w,yellow=y
|
||||
21. population: abundant=a,clustered=c,numerous=n,
|
||||
scattered=s,several=v,solitary=y
|
||||
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
|
||||
urban=u,waste=w,woods=d
|
||||
148
demo/binary_classification/agaricus-lepiota.names
Normal file
148
demo/binary_classification/agaricus-lepiota.names
Normal file
@@ -0,0 +1,148 @@
|
||||
1. Title: Mushroom Database
|
||||
|
||||
2. Sources:
|
||||
(a) Mushroom records drawn from The Audubon Society Field Guide to North
|
||||
American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
|
||||
A. Knopf
|
||||
(b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
|
||||
(c) Date: 27 April 1987
|
||||
|
||||
3. Past Usage:
|
||||
1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
|
||||
Adjustment (Technical Report 87-19). Doctoral disseration, Department
|
||||
of Information and Computer Science, University of California, Irvine.
|
||||
--- STAGGER: asymptoted to 95% classification accuracy after reviewing
|
||||
1000 instances.
|
||||
2. Iba,W., Wogulis,J., & Langley,P. (1988). Trading off Simplicity
|
||||
and Coverage in Incremental Concept Learning. In Proceedings of
|
||||
the 5th International Conference on Machine Learning, 73-79.
|
||||
Ann Arbor, Michigan: Morgan Kaufmann.
|
||||
-- approximately the same results with their HILLARY algorithm
|
||||
3. In the following references a set of rules (given below) were
|
||||
learned for this data set which may serve as a point of
|
||||
comparison for other researchers.
|
||||
|
||||
Duch W, Adamczak R, Grabczewski K (1996) Extraction of logical rules
|
||||
from training data using backpropagation networks, in: Proc. of the
|
||||
The 1st Online Workshop on Soft Computing, 19-30.Aug.1996, pp. 25-30,
|
||||
available on-line at: http://www.bioele.nuee.nagoya-u.ac.jp/wsc1/
|
||||
|
||||
Duch W, Adamczak R, Grabczewski K, Ishikawa M, Ueda H, Extraction of
|
||||
crisp logical rules using constrained backpropagation networks -
|
||||
comparison of two new approaches, in: Proc. of the European Symposium
|
||||
on Artificial Neural Networks (ESANN'97), Bruge, Belgium 16-18.4.1997,
|
||||
pp. xx-xx
|
||||
|
||||
Wlodzislaw Duch, Department of Computer Methods, Nicholas Copernicus
|
||||
University, 87-100 Torun, Grudziadzka 5, Poland
|
||||
e-mail: duch@phys.uni.torun.pl
|
||||
WWW http://www.phys.uni.torun.pl/kmk/
|
||||
|
||||
Date: Mon, 17 Feb 1997 13:47:40 +0100
|
||||
From: Wlodzislaw Duch <duch@phys.uni.torun.pl>
|
||||
Organization: Dept. of Computer Methods, UMK
|
||||
|
||||
I have attached a file containing logical rules for mushrooms.
|
||||
It should be helpful for other people since only in the last year I
|
||||
have seen about 10 papers analyzing this dataset and obtaining quite
|
||||
complex rules. We will try to contribute other results later.
|
||||
|
||||
With best regards, Wlodek Duch
|
||||
________________________________________________________________
|
||||
|
||||
Logical rules for the mushroom data sets.
|
||||
|
||||
Logical rules given below seem to be the simplest possible for the
|
||||
mushroom dataset and therefore should be treated as benchmark results.
|
||||
|
||||
Disjunctive rules for poisonous mushrooms, from most general
|
||||
to most specific:
|
||||
|
||||
P_1) odor=NOT(almond.OR.anise.OR.none)
|
||||
120 poisonous cases missed, 98.52% accuracy
|
||||
|
||||
P_2) spore-print-color=green
|
||||
48 cases missed, 99.41% accuracy
|
||||
|
||||
P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.
|
||||
(stalk-color-above-ring=NOT.brown)
|
||||
8 cases missed, 99.90% accuracy
|
||||
|
||||
P_4) habitat=leaves.AND.cap-color=white
|
||||
100% accuracy
|
||||
|
||||
Rule P_4) may also be
|
||||
|
||||
P_4') population=clustered.AND.cap_color=white
|
||||
|
||||
These rule involve 6 attributes (out of 22). Rules for edible
|
||||
mushrooms are obtained as negation of the rules given above, for
|
||||
example the rule:
|
||||
|
||||
odor=(almond.OR.anise.OR.none).AND.spore-print-color=NOT.green
|
||||
|
||||
gives 48 errors, or 99.41% accuracy on the whole dataset.
|
||||
|
||||
Several slightly more complex variations on these rules exist,
|
||||
involving other attributes, such as gill_size, gill_spacing,
|
||||
stalk_surface_above_ring, but the rules given above are the simplest
|
||||
we have found.
|
||||
|
||||
|
||||
4. Relevant Information:
|
||||
This data set includes descriptions of hypothetical samples
|
||||
corresponding to 23 species of gilled mushrooms in the Agaricus and
|
||||
Lepiota Family (pp. 500-525). Each species is identified as
|
||||
definitely edible, definitely poisonous, or of unknown edibility and
|
||||
not recommended. This latter class was combined with the poisonous
|
||||
one. The Guide clearly states that there is no simple rule for
|
||||
determining the edibility of a mushroom; no rule like ``leaflets
|
||||
three, let it be'' for Poisonous Oak and Ivy.
|
||||
|
||||
5. Number of Instances: 8124
|
||||
|
||||
6. Number of Attributes: 22 (all nominally valued)
|
||||
|
||||
7. Attribute Information: (classes: edible=e, poisonous=p)
|
||||
1. cap-shape: bell=b,conical=c,convex=x,flat=f,
|
||||
knobbed=k,sunken=s
|
||||
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
|
||||
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,
|
||||
pink=p,purple=u,red=e,white=w,yellow=y
|
||||
4. bruises?: bruises=t,no=f
|
||||
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
|
||||
musty=m,none=n,pungent=p,spicy=s
|
||||
6. gill-attachment: attached=a,descending=d,free=f,notched=n
|
||||
7. gill-spacing: close=c,crowded=w,distant=d
|
||||
8. gill-size: broad=b,narrow=n
|
||||
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
|
||||
green=r,orange=o,pink=p,purple=u,red=e,
|
||||
white=w,yellow=y
|
||||
10. stalk-shape: enlarging=e,tapering=t
|
||||
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
|
||||
rhizomorphs=z,rooted=r,missing=?
|
||||
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
|
||||
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
|
||||
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
|
||||
pink=p,red=e,white=w,yellow=y
|
||||
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
|
||||
pink=p,red=e,white=w,yellow=y
|
||||
16. veil-type: partial=p,universal=u
|
||||
17. veil-color: brown=n,orange=o,white=w,yellow=y
|
||||
18. ring-number: none=n,one=o,two=t
|
||||
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
|
||||
none=n,pendant=p,sheathing=s,zone=z
|
||||
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
|
||||
orange=o,purple=u,white=w,yellow=y
|
||||
21. population: abundant=a,clustered=c,numerous=n,
|
||||
scattered=s,several=v,solitary=y
|
||||
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
|
||||
urban=u,waste=w,woods=d
|
||||
|
||||
8. Missing Attribute Values: 2480 of them (denoted by "?"), all for
|
||||
attribute #11.
|
||||
|
||||
9. Class Distribution:
|
||||
-- edible: 4208 (51.8%)
|
||||
-- poisonous: 3916 (48.2%)
|
||||
-- total: 8124 instances
|
||||
50
demo/binary_classification/mapfeat.py
Executable file
50
demo/binary_classification/mapfeat.py
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
|
||||
def loadfmap( fname ):
|
||||
fmap = {}
|
||||
nmap = {}
|
||||
|
||||
for l in open( fname ):
|
||||
arr = l.split()
|
||||
if arr[0].find('.') != -1:
|
||||
idx = int( arr[0].strip('.') )
|
||||
assert idx not in fmap
|
||||
fmap[ idx ] = {}
|
||||
ftype = arr[1].strip(':')
|
||||
content = arr[2]
|
||||
else:
|
||||
content = arr[0]
|
||||
for it in content.split(','):
|
||||
if it.strip() == '':
|
||||
continue
|
||||
k , v = it.split('=')
|
||||
fmap[ idx ][ v ] = len(nmap)
|
||||
nmap[ len(nmap) ] = ftype+'='+k
|
||||
return fmap, nmap
|
||||
|
||||
def write_nmap( fo, nmap ):
|
||||
for i in xrange( len(nmap) ):
|
||||
fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
|
||||
|
||||
# start here
|
||||
fmap, nmap = loadfmap( 'agaricus-lepiota.fmap' )
|
||||
fo = open( 'featmap.txt', 'w' )
|
||||
write_nmap( fo, nmap )
|
||||
fo.close()
|
||||
|
||||
fo = open( 'agaricus.txt', 'w' )
|
||||
for l in open( 'agaricus-lepiota.data' ):
|
||||
arr = l.split(',')
|
||||
if arr[0] == 'p':
|
||||
fo.write('1')
|
||||
else:
|
||||
assert arr[0] == 'e'
|
||||
fo.write('0')
|
||||
for i in xrange( 1,len(arr) ):
|
||||
fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
|
||||
fo.write('\n')
|
||||
|
||||
fo.close()
|
||||
|
||||
|
||||
29
demo/binary_classification/mknfold.py
Executable file
29
demo/binary_classification/mknfold.py
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
import random
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage:<filename> <k> [nfold = 5]'
|
||||
exit(0)
|
||||
|
||||
random.seed( 10 )
|
||||
|
||||
k = int( sys.argv[2] )
|
||||
if len(sys.argv) > 3:
|
||||
nfold = int( sys.argv[3] )
|
||||
else:
|
||||
nfold = 5
|
||||
|
||||
fi = open( sys.argv[1], 'r' )
|
||||
ftr = open( sys.argv[1]+'.train', 'w' )
|
||||
fte = open( sys.argv[1]+'.test', 'w' )
|
||||
for l in fi:
|
||||
if random.randint( 1 , nfold ) == k:
|
||||
fte.write( l )
|
||||
else:
|
||||
ftr.write( l )
|
||||
|
||||
fi.close()
|
||||
ftr.close()
|
||||
fte.close()
|
||||
|
||||
27
demo/binary_classification/mushroom.conf
Normal file
27
demo/binary_classification/mushroom.conf
Normal file
@@ -0,0 +1,27 @@
|
||||
# General Parameters, see comment for each definition
|
||||
# choose the tree booster, 0: tree, 1: linear
|
||||
booster_type = 0
|
||||
# choose logistic regression loss function for binary classification
|
||||
loss_type = 2
|
||||
|
||||
# Tree Booster Parameters
|
||||
# step size shrinkage
|
||||
bst:eta = 1.0
|
||||
# minimum loss reduction required to make a further partition
|
||||
bst:gamma = 1.0
|
||||
# minimum sum of instance weight(hessian) needed in a child
|
||||
bst:min_child_weight = 1
|
||||
# maximum depth of a tree
|
||||
bst:max_depth = 3
|
||||
|
||||
# Task Parameters
|
||||
# the number of round to do boosting
|
||||
num_round = 2
|
||||
# 0 means do not save any model except the final round model
|
||||
save_period = 0
|
||||
# The path of training data
|
||||
data = "agaricus.txt.train"
|
||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||
eval[test] = "agaricus.txt.test"
|
||||
# The path of test data
|
||||
test:data = "agaricus.txt.test"
|
||||
15
demo/binary_classification/runexp.sh
Executable file
15
demo/binary_classification/runexp.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
# map feature using indicator encoding, also produce featmap.txt
|
||||
python mapfeat.py
|
||||
# split train and test
|
||||
python mknfold.py agaricus.txt 1
|
||||
# training and output the models
|
||||
../../xgboost mushroom.conf
|
||||
# output prediction task=pred
|
||||
../../xgboost mushroom.conf task=pred model_in=0002.model
|
||||
# print the boosters of 00002.model in dump.raw.txt
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
# use the feature map in printing for better visualization
|
||||
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
cat dump.nice.txt
|
||||
|
||||
13
demo/regression/README
Normal file
13
demo/regression/README
Normal file
@@ -0,0 +1,13 @@
|
||||
Demonstrating how to use XGBoost accomplish regression tasks on computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
|
||||
|
||||
Run: ./runexp.sh
|
||||
|
||||
Format of input: LIBSVM format
|
||||
|
||||
Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
|
||||
- Feature id must be from 0 to number of features, in sorted order.
|
||||
- i means this feature is binary indicator feature
|
||||
- q means this feature is a quantitative value, such as age, time, can be missing
|
||||
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
|
||||
|
||||
Explainations: https://github.com/tqchen/xgboost/wiki/Regression
|
||||
30
demo/regression/machine.conf
Normal file
30
demo/regression/machine.conf
Normal file
@@ -0,0 +1,30 @@
|
||||
# General Parameters, see comment for each definition
|
||||
# choose the tree booster, 0: tree, 1: linear
|
||||
booster_type = 0
|
||||
# this is the only difference with classification, use 0: linear regression
|
||||
# when labels are in [0,1] we can also use 1: logistic regression
|
||||
loss_type = 0
|
||||
|
||||
# Tree Booster Parameters
|
||||
# step size shrinkage
|
||||
bst:eta = 1.0
|
||||
# minimum loss reduction required to make a further partition
|
||||
bst:gamma = 1.0
|
||||
# minimum sum of instance weight(hessian) needed in a child
|
||||
bst:min_child_weight = 1
|
||||
# maximum depth of a tree
|
||||
bst:max_depth = 3
|
||||
|
||||
# Task parameters
|
||||
# the number of round to do boosting
|
||||
num_round = 2
|
||||
# 0 means do not save any model except the final round model
|
||||
save_period = 0
|
||||
# The path of training data
|
||||
data = "machine.txt.train"
|
||||
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
|
||||
eval[test] = "machine.txt.test"
|
||||
# The path of test data
|
||||
test:data = "machine.txt.test"
|
||||
|
||||
|
||||
209
demo/regression/machine.data
Normal file
209
demo/regression/machine.data
Normal file
@@ -0,0 +1,209 @@
|
||||
adviser,32/60,125,256,6000,256,16,128,198,199
|
||||
amdahl,470v/7,29,8000,32000,32,8,32,269,253
|
||||
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
|
||||
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
|
||||
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
|
||||
amdahl,470v/b,26,8000,32000,64,8,32,318,290
|
||||
amdahl,580-5840,23,16000,32000,64,16,32,367,381
|
||||
amdahl,580-5850,23,16000,32000,64,16,32,489,381
|
||||
amdahl,580-5860,23,16000,64000,64,16,32,636,749
|
||||
amdahl,580-5880,23,32000,64000,128,32,64,1144,1238
|
||||
apollo,dn320,400,1000,3000,0,1,2,38,23
|
||||
apollo,dn420,400,512,3500,4,1,6,40,24
|
||||
basf,7/65,60,2000,8000,65,1,8,92,70
|
||||
basf,7/68,50,4000,16000,65,1,8,138,117
|
||||
bti,5000,350,64,64,0,1,4,10,15
|
||||
bti,8000,200,512,16000,0,4,32,35,64
|
||||
burroughs,b1955,167,524,2000,8,4,15,19,23
|
||||
burroughs,b2900,143,512,5000,0,7,32,28,29
|
||||
burroughs,b2925,143,1000,2000,0,5,16,31,22
|
||||
burroughs,b4955,110,5000,5000,142,8,64,120,124
|
||||
burroughs,b5900,143,1500,6300,0,5,32,30,35
|
||||
burroughs,b5920,143,3100,6200,0,5,20,33,39
|
||||
burroughs,b6900,143,2300,6200,0,6,64,61,40
|
||||
burroughs,b6925,110,3100,6200,0,6,64,76,45
|
||||
c.r.d,68/10-80,320,128,6000,0,1,12,23,28
|
||||
c.r.d,universe:2203t,320,512,2000,4,1,3,69,21
|
||||
c.r.d,universe:68,320,256,6000,0,1,6,33,28
|
||||
c.r.d,universe:68/05,320,256,3000,4,1,3,27,22
|
||||
c.r.d,universe:68/137,320,512,5000,4,1,5,77,28
|
||||
c.r.d,universe:68/37,320,256,5000,4,1,6,27,27
|
||||
cdc,cyber:170/750,25,1310,2620,131,12,24,274,102
|
||||
cdc,cyber:170/760,25,1310,2620,131,12,24,368,102
|
||||
cdc,cyber:170/815,50,2620,10480,30,12,24,32,74
|
||||
cdc,cyber:170/825,50,2620,10480,30,12,24,63,74
|
||||
cdc,cyber:170/835,56,5240,20970,30,12,24,106,138
|
||||
cdc,cyber:170/845,64,5240,20970,30,12,24,208,136
|
||||
cdc,omega:480-i,50,500,2000,8,1,4,20,23
|
||||
cdc,omega:480-ii,50,1000,4000,8,1,5,29,29
|
||||
cdc,omega:480-iii,50,2000,8000,8,1,5,71,44
|
||||
cambex,1636-1,50,1000,4000,8,3,5,26,30
|
||||
cambex,1636-10,50,1000,8000,8,3,5,36,41
|
||||
cambex,1641-1,50,2000,16000,8,3,5,40,74
|
||||
cambex,1641-11,50,2000,16000,8,3,6,52,74
|
||||
cambex,1651-1,50,2000,16000,8,3,6,60,74
|
||||
dec,decsys:10:1091,133,1000,12000,9,3,12,72,54
|
||||
dec,decsys:20:2060,133,1000,8000,9,3,12,72,41
|
||||
dec,microvax-1,810,512,512,8,1,1,18,18
|
||||
dec,vax:11/730,810,1000,5000,0,1,1,20,28
|
||||
dec,vax:11/750,320,512,8000,4,1,5,40,36
|
||||
dec,vax:11/780,200,512,8000,8,1,8,62,38
|
||||
dg,eclipse:c/350,700,384,8000,0,1,1,24,34
|
||||
dg,eclipse:m/600,700,256,2000,0,1,1,24,19
|
||||
dg,eclipse:mv/10000,140,1000,16000,16,1,3,138,72
|
||||
dg,eclipse:mv/4000,200,1000,8000,0,1,2,36,36
|
||||
dg,eclipse:mv/6000,110,1000,4000,16,1,2,26,30
|
||||
dg,eclipse:mv/8000,110,1000,12000,16,1,2,60,56
|
||||
dg,eclipse:mv/8000-ii,220,1000,8000,16,1,2,71,42
|
||||
formation,f4000/100,800,256,8000,0,1,4,12,34
|
||||
formation,f4000/200,800,256,8000,0,1,4,14,34
|
||||
formation,f4000/200ap,800,256,8000,0,1,4,20,34
|
||||
formation,f4000/300,800,256,8000,0,1,4,16,34
|
||||
formation,f4000/300ap,800,256,8000,0,1,4,22,34
|
||||
four-phase,2000/260,125,512,1000,0,8,20,36,19
|
||||
gould,concept:32/8705,75,2000,8000,64,1,38,144,75
|
||||
gould,concept:32/8750,75,2000,16000,64,1,38,144,113
|
||||
gould,concept:32/8780,75,2000,16000,128,1,38,259,157
|
||||
hp,3000/30,90,256,1000,0,3,10,17,18
|
||||
hp,3000/40,105,256,2000,0,3,10,26,20
|
||||
hp,3000/44,105,1000,4000,0,3,24,32,28
|
||||
hp,3000/48,105,2000,4000,8,3,19,32,33
|
||||
hp,3000/64,75,2000,8000,8,3,24,62,47
|
||||
hp,3000/88,75,3000,8000,8,3,48,64,54
|
||||
hp,3000/iii,175,256,2000,0,3,24,22,20
|
||||
harris,100,300,768,3000,0,6,24,36,23
|
||||
harris,300,300,768,3000,6,6,24,44,25
|
||||
harris,500,300,768,12000,6,6,24,50,52
|
||||
harris,600,300,768,4500,0,1,24,45,27
|
||||
harris,700,300,384,12000,6,1,24,53,50
|
||||
harris,80,300,192,768,6,6,24,36,18
|
||||
harris,800,180,768,12000,6,1,31,84,53
|
||||
honeywell,dps:6/35,330,1000,3000,0,2,4,16,23
|
||||
honeywell,dps:6/92,300,1000,4000,8,3,64,38,30
|
||||
honeywell,dps:6/96,300,1000,16000,8,2,112,38,73
|
||||
honeywell,dps:7/35,330,1000,2000,0,1,2,16,20
|
||||
honeywell,dps:7/45,330,1000,4000,0,3,6,22,25
|
||||
honeywell,dps:7/55,140,2000,4000,0,3,6,29,28
|
||||
honeywell,dps:7/65,140,2000,4000,0,4,8,40,29
|
||||
honeywell,dps:8/44,140,2000,4000,8,1,20,35,32
|
||||
honeywell,dps:8/49,140,2000,32000,32,1,20,134,175
|
||||
honeywell,dps:8/50,140,2000,8000,32,1,54,66,57
|
||||
honeywell,dps:8/52,140,2000,32000,32,1,54,141,181
|
||||
honeywell,dps:8/62,140,2000,32000,32,1,54,189,181
|
||||
honeywell,dps:8/20,140,2000,4000,8,1,20,22,32
|
||||
ibm,3033:s,57,4000,16000,1,6,12,132,82
|
||||
ibm,3033:u,57,4000,24000,64,12,16,237,171
|
||||
ibm,3081,26,16000,32000,64,16,24,465,361
|
||||
ibm,3081:d,26,16000,32000,64,8,24,465,350
|
||||
ibm,3083:b,26,8000,32000,0,8,24,277,220
|
||||
ibm,3083:e,26,8000,16000,0,8,16,185,113
|
||||
ibm,370/125-2,480,96,512,0,1,1,6,15
|
||||
ibm,370/148,203,1000,2000,0,1,5,24,21
|
||||
ibm,370/158-3,115,512,6000,16,1,6,45,35
|
||||
ibm,38/3,1100,512,1500,0,1,1,7,18
|
||||
ibm,38/4,1100,768,2000,0,1,1,13,20
|
||||
ibm,38/5,600,768,2000,0,1,1,16,20
|
||||
ibm,38/7,400,2000,4000,0,1,1,32,28
|
||||
ibm,38/8,400,4000,8000,0,1,1,32,45
|
||||
ibm,4321,900,1000,1000,0,1,2,11,18
|
||||
ibm,4331-1,900,512,1000,0,1,2,11,17
|
||||
ibm,4331-11,900,1000,4000,4,1,2,18,26
|
||||
ibm,4331-2,900,1000,4000,8,1,2,22,28
|
||||
ibm,4341,900,2000,4000,0,3,6,37,28
|
||||
ibm,4341-1,225,2000,4000,8,3,6,40,31
|
||||
ibm,4341-10,225,2000,4000,8,3,6,34,31
|
||||
ibm,4341-11,180,2000,8000,8,1,6,50,42
|
||||
ibm,4341-12,185,2000,16000,16,1,6,76,76
|
||||
ibm,4341-2,180,2000,16000,16,1,6,66,76
|
||||
ibm,4341-9,225,1000,4000,2,3,6,24,26
|
||||
ibm,4361-4,25,2000,12000,8,1,4,49,59
|
||||
ibm,4361-5,25,2000,12000,16,3,5,66,65
|
||||
ibm,4381-1,17,4000,16000,8,6,12,100,101
|
||||
ibm,4381-2,17,4000,16000,32,6,12,133,116
|
||||
ibm,8130-a,1500,768,1000,0,0,0,12,18
|
||||
ibm,8130-b,1500,768,2000,0,0,0,18,20
|
||||
ibm,8140,800,768,2000,0,0,0,20,20
|
||||
ipl,4436,50,2000,4000,0,3,6,27,30
|
||||
ipl,4443,50,2000,8000,8,3,6,45,44
|
||||
ipl,4445,50,2000,8000,8,1,6,56,44
|
||||
ipl,4446,50,2000,16000,24,1,6,70,82
|
||||
ipl,4460,50,2000,16000,24,1,6,80,82
|
||||
ipl,4480,50,8000,16000,48,1,10,136,128
|
||||
magnuson,m80/30,100,1000,8000,0,2,6,16,37
|
||||
magnuson,m80/31,100,1000,8000,24,2,6,26,46
|
||||
magnuson,m80/32,100,1000,8000,24,3,6,32,46
|
||||
magnuson,m80/42,50,2000,16000,12,3,16,45,80
|
||||
magnuson,m80/43,50,2000,16000,24,6,16,54,88
|
||||
magnuson,m80/44,50,2000,16000,24,6,16,65,88
|
||||
microdata,seq.ms/3200,150,512,4000,0,8,128,30,33
|
||||
nas,as/3000,115,2000,8000,16,1,3,50,46
|
||||
nas,as/3000-n,115,2000,4000,2,1,5,40,29
|
||||
nas,as/5000,92,2000,8000,32,1,6,62,53
|
||||
nas,as/5000-e,92,2000,8000,32,1,6,60,53
|
||||
nas,as/5000-n,92,2000,8000,4,1,6,50,41
|
||||
nas,as/6130,75,4000,16000,16,1,6,66,86
|
||||
nas,as/6150,60,4000,16000,32,1,6,86,95
|
||||
nas,as/6620,60,2000,16000,64,5,8,74,107
|
||||
nas,as/6630,60,4000,16000,64,5,8,93,117
|
||||
nas,as/6650,50,4000,16000,64,5,10,111,119
|
||||
nas,as/7000,72,4000,16000,64,8,16,143,120
|
||||
nas,as/7000-n,72,2000,8000,16,6,8,105,48
|
||||
nas,as/8040,40,8000,16000,32,8,16,214,126
|
||||
nas,as/8050,40,8000,32000,64,8,24,277,266
|
||||
nas,as/8060,35,8000,32000,64,8,24,370,270
|
||||
nas,as/9000-dpc,38,16000,32000,128,16,32,510,426
|
||||
nas,as/9000-n,48,4000,24000,32,8,24,214,151
|
||||
nas,as/9040,38,8000,32000,64,8,24,326,267
|
||||
nas,as/9060,30,16000,32000,256,16,24,510,603
|
||||
ncr,v8535:ii,112,1000,1000,0,1,4,8,19
|
||||
ncr,v8545:ii,84,1000,2000,0,1,6,12,21
|
||||
ncr,v8555:ii,56,1000,4000,0,1,6,17,26
|
||||
ncr,v8565:ii,56,2000,6000,0,1,8,21,35
|
||||
ncr,v8565:ii-e,56,2000,8000,0,1,8,24,41
|
||||
ncr,v8575:ii,56,4000,8000,0,1,8,34,47
|
||||
ncr,v8585:ii,56,4000,12000,0,1,8,42,62
|
||||
ncr,v8595:ii,56,4000,16000,0,1,8,46,78
|
||||
ncr,v8635,38,4000,8000,32,16,32,51,80
|
||||
ncr,v8650,38,4000,8000,32,16,32,116,80
|
||||
ncr,v8655,38,8000,16000,64,4,8,100,142
|
||||
ncr,v8665,38,8000,24000,160,4,8,140,281
|
||||
ncr,v8670,38,4000,16000,128,16,32,212,190
|
||||
nixdorf,8890/30,200,1000,2000,0,1,2,25,21
|
||||
nixdorf,8890/50,200,1000,4000,0,1,4,30,25
|
||||
nixdorf,8890/70,200,2000,8000,64,1,5,41,67
|
||||
perkin-elmer,3205,250,512,4000,0,1,7,25,24
|
||||
perkin-elmer,3210,250,512,4000,0,4,7,50,24
|
||||
perkin-elmer,3230,250,1000,16000,1,1,8,50,64
|
||||
prime,50-2250,160,512,4000,2,1,5,30,25
|
||||
prime,50-250-ii,160,512,2000,2,3,8,32,20
|
||||
prime,50-550-ii,160,1000,4000,8,1,14,38,29
|
||||
prime,50-750-ii,160,1000,8000,16,1,14,60,43
|
||||
prime,50-850-ii,160,2000,8000,32,1,13,109,53
|
||||
siemens,7.521,240,512,1000,8,1,3,6,19
|
||||
siemens,7.531,240,512,2000,8,1,5,11,22
|
||||
siemens,7.536,105,2000,4000,8,3,8,22,31
|
||||
siemens,7.541,105,2000,6000,16,6,16,33,41
|
||||
siemens,7.551,105,2000,8000,16,4,14,58,47
|
||||
siemens,7.561,52,4000,16000,32,4,12,130,99
|
||||
siemens,7.865-2,70,4000,12000,8,6,8,75,67
|
||||
siemens,7.870-2,59,4000,12000,32,6,12,113,81
|
||||
siemens,7.872-2,59,8000,16000,64,12,24,188,149
|
||||
siemens,7.875-2,26,8000,24000,32,8,16,173,183
|
||||
siemens,7.880-2,26,8000,32000,64,12,16,248,275
|
||||
siemens,7.881-2,26,8000,32000,128,24,32,405,382
|
||||
sperry,1100/61-h1,116,2000,8000,32,5,28,70,56
|
||||
sperry,1100/81,50,2000,32000,24,6,26,114,182
|
||||
sperry,1100/82,50,2000,32000,48,26,52,208,227
|
||||
sperry,1100/83,50,2000,32000,112,52,104,307,341
|
||||
sperry,1100/84,50,4000,32000,112,52,104,397,360
|
||||
sperry,1100/93,30,8000,64000,96,12,176,915,919
|
||||
sperry,1100/94,30,8000,64000,128,12,176,1150,978
|
||||
sperry,80/3,180,262,4000,0,1,3,12,24
|
||||
sperry,80/4,180,512,4000,0,1,3,14,24
|
||||
sperry,80/5,180,262,4000,0,1,3,18,24
|
||||
sperry,80/6,180,512,4000,0,1,3,21,24
|
||||
sperry,80/8,124,1000,8000,0,1,8,42,37
|
||||
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
|
||||
sratus,32,125,2000,8000,0,2,14,52,41
|
||||
wang,vs-100,480,512,8000,32,0,0,67,47
|
||||
wang,vs-90,480,1000,4000,0,0,0,45,25
|
||||
72
demo/regression/machine.names
Normal file
72
demo/regression/machine.names
Normal file
@@ -0,0 +1,72 @@
|
||||
1. Title: Relative CPU Performance Data
|
||||
|
||||
2. Source Information
|
||||
-- Creators: Phillip Ein-Dor and Jacob Feldmesser
|
||||
-- Ein-Dor: Faculty of Management; Tel Aviv University; Ramat-Aviv;
|
||||
Tel Aviv, 69978; Israel
|
||||
-- Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779
|
||||
-- Date: October, 1987
|
||||
|
||||
3. Past Usage:
|
||||
1. Ein-Dor and Feldmesser (CACM 4/87, pp 308-317)
|
||||
-- Results:
|
||||
-- linear regression prediction of relative cpu performance
|
||||
-- Recorded 34% average deviation from actual values
|
||||
2. Kibler,D. & Aha,D. (1988). Instance-Based Prediction of
|
||||
Real-Valued Attributes. In Proceedings of the CSCSI (Canadian
|
||||
AI) Conference.
|
||||
-- Results:
|
||||
-- instance-based prediction of relative cpu performance
|
||||
-- similar results; no transformations required
|
||||
- Predicted attribute: cpu relative performance (numeric)
|
||||
|
||||
4. Relevant Information:
|
||||
-- The estimated relative performance values were estimated by the authors
|
||||
using a linear regression method. See their article (pp 308-313) for
|
||||
more details on how the relative performance values were set.
|
||||
|
||||
5. Number of Instances: 209
|
||||
|
||||
6. Number of Attributes: 10 (6 predictive attributes, 2 non-predictive,
|
||||
1 goal field, and the linear regression's guess)
|
||||
|
||||
7. Attribute Information:
|
||||
1. vendor name: 30
|
||||
(adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec,
|
||||
dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson,
|
||||
microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry,
|
||||
sratus, wang)
|
||||
2. Model Name: many unique symbols
|
||||
3. MYCT: machine cycle time in nanoseconds (integer)
|
||||
4. MMIN: minimum main memory in kilobytes (integer)
|
||||
5. MMAX: maximum main memory in kilobytes (integer)
|
||||
6. CACH: cache memory in kilobytes (integer)
|
||||
7. CHMIN: minimum channels in units (integer)
|
||||
8. CHMAX: maximum channels in units (integer)
|
||||
9. PRP: published relative performance (integer)
|
||||
10. ERP: estimated relative performance from the original article (integer)
|
||||
|
||||
8. Missing Attribute Values: None
|
||||
|
||||
9. Class Distribution: the class value (PRP) is continuously valued.
|
||||
PRP Value Range: Number of Instances in Range:
|
||||
0-20 31
|
||||
21-100 121
|
||||
101-200 27
|
||||
201-300 13
|
||||
301-400 7
|
||||
401-500 4
|
||||
501-600 2
|
||||
above 600 4
|
||||
|
||||
Summary Statistics:
|
||||
Min Max Mean SD PRP Correlation
|
||||
MCYT: 17 1500 203.8 260.3 -0.3071
|
||||
MMIN: 64 32000 2868.0 3878.7 0.7949
|
||||
MMAX: 64 64000 11796.1 11726.6 0.8630
|
||||
CACH: 0 256 25.2 40.6 0.6626
|
||||
CHMIN: 0 52 4.7 6.8 0.6089
|
||||
CHMAX: 0 176 18.2 26.0 0.6052
|
||||
PRP: 6 1150 105.6 160.8 1.0000
|
||||
ERP: 15 1238 99.3 154.8 0.9665
|
||||
|
||||
32
demo/regression/mapfeat.py
Executable file
32
demo/regression/mapfeat.py
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
|
||||
fo = open( 'machine.txt', 'w' )
|
||||
cnt = 6
|
||||
fmap = {}
|
||||
for l in open( 'machine.data' ):
|
||||
arr = l.split(',')
|
||||
fo.write(arr[8])
|
||||
for i in xrange( 0,6 ):
|
||||
fo.write( ' %d:%s' %(i,arr[i+2]) )
|
||||
|
||||
if arr[0] not in fmap:
|
||||
fmap[arr[0]] = cnt
|
||||
cnt += 1
|
||||
|
||||
fo.write( ' %d:1' % fmap[arr[0]] )
|
||||
fo.write('\n')
|
||||
|
||||
fo.close()
|
||||
|
||||
# create feature map for machine data
|
||||
fo = open('featmap.txt', 'w')
|
||||
# list from machine.names
|
||||
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
|
||||
|
||||
for i in xrange(0,6):
|
||||
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
|
||||
|
||||
for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
|
||||
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
|
||||
fo.close()
|
||||
29
demo/regression/mknfold.py
Executable file
29
demo/regression/mknfold.py
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
import random
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print 'Usage:<filename> <k> [nfold = 5]'
|
||||
exit(0)
|
||||
|
||||
random.seed( 10 )
|
||||
|
||||
k = int( sys.argv[2] )
|
||||
if len(sys.argv) > 3:
|
||||
nfold = int( sys.argv[3] )
|
||||
else:
|
||||
nfold = 5
|
||||
|
||||
fi = open( sys.argv[1], 'r' )
|
||||
ftr = open( sys.argv[1]+'.train', 'w' )
|
||||
fte = open( sys.argv[1]+'.test', 'w' )
|
||||
for l in fi:
|
||||
if random.randint( 1 , nfold ) == k:
|
||||
fte.write( l )
|
||||
else:
|
||||
ftr.write( l )
|
||||
|
||||
fi.close()
|
||||
ftr.close()
|
||||
fte.close()
|
||||
|
||||
16
demo/regression/runexp.sh
Executable file
16
demo/regression/runexp.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
# map the data to features. For convenience we only use 7 original attributes and encode them as features in a trivial way
|
||||
python mapfeat.py
|
||||
# split train and test
|
||||
python mknfold.py machine.txt 1
|
||||
# training and output the models
|
||||
../../xgboost machine.conf
|
||||
# output predictions of test data
|
||||
../../xgboost machine.conf task=pred model_in=0002.model
|
||||
# print the boosters of 0002.model in dump.raw.txt
|
||||
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
|
||||
# print the boosters of 0002.model in dump.nice.txt with feature map
|
||||
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
|
||||
|
||||
# cat the result
|
||||
cat dump.nice.txt
|
||||
403
regression/xgboost_reg.h
Normal file
403
regression/xgboost_reg.h
Normal file
@@ -0,0 +1,403 @@
|
||||
#ifndef XGBOOST_REG_H
|
||||
#define XGBOOST_REG_H
|
||||
/*!
|
||||
* \file xgboost_reg.h
|
||||
* \brief class for gradient boosted regression
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include "xgboost_reg_data.h"
|
||||
#include "xgboost_reg_eval.h"
|
||||
#include "../utils/xgboost_omp.h"
|
||||
#include "../booster/xgboost_gbmbase.h"
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "../utils/xgboost_stream.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace regression{
|
||||
/*! \brief class for gradient boosted regression */
|
||||
class RegBoostLearner{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
RegBoostLearner( void ){
|
||||
silent = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief a regression booter associated with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
RegBoostLearner( const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname ){
|
||||
silent = 0;
|
||||
this->SetData(train,evals,evname);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief associate regression booster with training and evaluating data
|
||||
* \param train pointer to the training data
|
||||
* \param evals array of evaluating data
|
||||
* \param evname name of evaluation data, used print statistics
|
||||
*/
|
||||
inline void SetData( const DMatrix *train,
|
||||
const std::vector<DMatrix *> &evals,
|
||||
const std::vector<std::string> &evname ){
|
||||
this->train_ = train;
|
||||
this->evals_ = evals;
|
||||
this->evname_ = evname;
|
||||
// estimate feature bound
|
||||
int num_feature = (int)(train->data.NumCol());
|
||||
// assign buffer index
|
||||
unsigned buffer_size = static_cast<unsigned>( train->Size() );
|
||||
|
||||
for( size_t i = 0; i < evals.size(); ++ i ){
|
||||
buffer_size += static_cast<unsigned>( evals[i]->Size() );
|
||||
num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) );
|
||||
}
|
||||
|
||||
char str_temp[25];
|
||||
if( num_feature > mparam.num_feature ){
|
||||
mparam.num_feature = num_feature;
|
||||
sprintf( str_temp, "%d", num_feature );
|
||||
base_gbm.SetParam( "bst:num_feature", str_temp );
|
||||
}
|
||||
|
||||
sprintf( str_temp, "%u", buffer_size );
|
||||
base_gbm.SetParam( "num_pbuffer", str_temp );
|
||||
if( !silent ){
|
||||
printf( "buffer_size=%u\n", buffer_size );
|
||||
}
|
||||
|
||||
// set eval_preds tmp sapce
|
||||
this->eval_preds_.resize( evals.size(), std::vector<float>() );
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp( name, "silent") ) silent = atoi( val );
|
||||
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
|
||||
mparam.SetParam( name, val );
|
||||
base_gbm.SetParam( name, val );
|
||||
}
|
||||
/*!
|
||||
* \brief initialize solver before training, called before training
|
||||
* this function is reserved for solver to allocate necessary space and do other preparation
|
||||
*/
|
||||
inline void InitTrainer( void ){
|
||||
base_gbm.InitTrainer();
|
||||
if( mparam.loss_type == kLogisticClassify ){
|
||||
evaluator_.AddEval( "error" );
|
||||
}else{
|
||||
evaluator_.AddEval( "rmse" );
|
||||
}
|
||||
evaluator_.Init();
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the current data storage for model, if the model is used first time, call this function
|
||||
*/
|
||||
inline void InitModel( void ){
|
||||
base_gbm.InitModel();
|
||||
mparam.AdjustBase();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel( utils::IStream &fi ){
|
||||
base_gbm.LoadModel( fi );
|
||||
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
|
||||
}
|
||||
/*!
|
||||
* \brief DumpModel
|
||||
* \param fo text file
|
||||
* \param fmap feature map that may help give interpretations of feature
|
||||
* \param with_stats whether print statistics as well
|
||||
*/
|
||||
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
|
||||
base_gbm.DumpModel( fo, fmap, with_stats );
|
||||
}
|
||||
/*!
|
||||
* \brief Dump path of all trees
|
||||
* \param fo text file
|
||||
* \param data input data
|
||||
*/
|
||||
inline void DumpPath( FILE *fo, const DMatrix &data ){
|
||||
base_gbm.DumpPath( fo, data.data );
|
||||
}
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel( utils::IStream &fo ) const{
|
||||
base_gbm.SaveModel( fo );
|
||||
fo.Write( &mparam, sizeof(ModelParam) );
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateOneIter( int iter ){
|
||||
this->PredictBuffer( preds_, *train_, 0 );
|
||||
this->GetGradient( preds_, train_->labels, grad_, hess_ );
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
|
||||
}
|
||||
/*!
|
||||
* \brief evaluate the model for specific iteration
|
||||
* \param iter iteration number
|
||||
* \param fo file to output log
|
||||
*/
|
||||
inline void EvalOneIter( int iter, FILE *fo = stderr ){
|
||||
fprintf( fo, "[%d]", iter );
|
||||
int buffer_offset = static_cast<int>( train_->Size() );
|
||||
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
std::vector<float> &preds = this->eval_preds_[ i ];
|
||||
this->PredictBuffer( preds, *evals_[i], buffer_offset);
|
||||
evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
}
|
||||
fprintf( fo,"\n" );
|
||||
}
|
||||
/*! \brief get prediction, without buffering */
|
||||
inline void Predict( std::vector<float> &preds, const DMatrix &data ){
|
||||
preds.resize( data.Size() );
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
|
||||
}
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iteration iteration number
|
||||
*/
|
||||
inline void UpdateInteract( std::string action ){
|
||||
this->InteractPredict( preds_, *train_, 0 );
|
||||
|
||||
int buffer_offset = static_cast<int>( train_->Size() );
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
std::vector<float> &preds = this->eval_preds_[ i ];
|
||||
this->InteractPredict( preds, *evals_[i], buffer_offset );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
}
|
||||
|
||||
if( action == "remove" ){
|
||||
base_gbm.DelteBooster(); return;
|
||||
}
|
||||
|
||||
this->GetGradient( preds_, train_->labels, grad_, hess_ );
|
||||
std::vector<unsigned> root_index;
|
||||
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
|
||||
|
||||
this->InteractRePredict( *train_, 0 );
|
||||
buffer_offset = static_cast<int>( train_->Size() );
|
||||
for( size_t i = 0; i < evals_.size(); ++i ){
|
||||
this->InteractRePredict( *evals_[i], buffer_offset );
|
||||
buffer_offset += static_cast<int>( evals_[i]->Size() );
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief get the transformed predictions, given data */
|
||||
inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
|
||||
preds.resize( data.Size() );
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) );
|
||||
}
|
||||
}
|
||||
/*! \brief repredict trial */
|
||||
inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
base_gbm.InteractRePredict( data.data, j, buffer_offset + j );
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief get the transformed predictions, given data */
|
||||
inline void PredictBuffer( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
|
||||
preds.resize( data.Size() );
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( data.Size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
preds[j] = mparam.PredTransform
|
||||
( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
|
||||
inline void GetGradient( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels,
|
||||
std::vector<float> &grad,
|
||||
std::vector<float> &hess ){
|
||||
grad.resize( preds.size() ); hess.resize( preds.size() );
|
||||
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
#pragma omp parallel for schedule( static )
|
||||
for( unsigned j = 0; j < ndata; ++ j ){
|
||||
grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] );
|
||||
hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] );
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
enum LossType{
|
||||
kLinearSquare = 0,
|
||||
kLogisticNeglik = 1,
|
||||
kLogisticClassify = 2
|
||||
};
|
||||
|
||||
/*! \brief training parameter for regression */
|
||||
struct ModelParam{
|
||||
/* \brief global bias */
|
||||
float base_score;
|
||||
/* \brief type of loss function */
|
||||
int loss_type;
|
||||
/* \brief number of features */
|
||||
int num_feature;
|
||||
/*! \brief reserved field */
|
||||
int reserved[ 16 ];
|
||||
/*! \brief constructor */
|
||||
ModelParam( void ){
|
||||
base_score = 0.5f;
|
||||
loss_type = 0;
|
||||
num_feature = 0;
|
||||
memset( reserved, 0, sizeof( reserved ) );
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
|
||||
if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
|
||||
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
|
||||
}
|
||||
/*!
|
||||
* \brief adjust base_score
|
||||
*/
|
||||
inline void AdjustBase( void ){
|
||||
if( loss_type == 1 || loss_type == 2 ){
|
||||
utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
|
||||
base_score = - logf( 1.0f / base_score - 1.0f );
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief transform the linear sum to prediction
|
||||
* \param x linear sum of boosting ensemble
|
||||
* \return transformed prediction
|
||||
*/
|
||||
inline float PredTransform( float x ){
|
||||
switch( loss_type ){
|
||||
case kLinearSquare: return x;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return 1.0f/(1.0f + expf(-x));
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief calculate first order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return first order gradient
|
||||
*/
|
||||
inline float FirstOrderGradient( float predt, float label ) const{
|
||||
switch( loss_type ){
|
||||
case kLinearSquare: return predt - label;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt - label;
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief calculate second order gradient of loss, given transformed prediction
|
||||
* \param predt transformed prediction
|
||||
* \param label true label
|
||||
* \return second order gradient
|
||||
*/
|
||||
inline float SecondOrderGradient( float predt, float label ) const{
|
||||
switch( loss_type ){
|
||||
case kLinearSquare: return 1.0f;
|
||||
case kLogisticClassify:
|
||||
case kLogisticNeglik: return predt * ( 1 - predt );
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief calculating the loss, given the predictions, labels and the loss type
|
||||
* \param preds the given predictions
|
||||
* \param labels the given labels
|
||||
* \return the specified loss
|
||||
*/
|
||||
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
||||
switch( loss_type ){
|
||||
case kLinearSquare: return SquareLoss(preds,labels);
|
||||
case kLogisticNeglik:
|
||||
case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels);
|
||||
default: utils::Error("unknown loss_type"); return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief calculating the square loss, given the predictions and labels
|
||||
* \param preds the given predictions
|
||||
* \param labels the given labels
|
||||
* \return the summation of square loss
|
||||
*/
|
||||
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
||||
float ans = 0.0;
|
||||
for(size_t i = 0; i < preds.size(); i++){
|
||||
float dif = preds[i] - labels[i];
|
||||
ans += dif * dif;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief calculating the square loss, given the predictions and labels
|
||||
* \param preds the given predictions
|
||||
* \param labels the given labels
|
||||
* \return the summation of square loss
|
||||
*/
|
||||
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
|
||||
float ans = 0.0;
|
||||
for(size_t i = 0; i < preds.size(); i++)
|
||||
ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]);
|
||||
return ans;
|
||||
}
|
||||
};
|
||||
private:
|
||||
int silent;
|
||||
EvalSet evaluator_;
|
||||
booster::GBMBase base_gbm;
|
||||
ModelParam mparam;
|
||||
const DMatrix *train_;
|
||||
std::vector<DMatrix *> evals_;
|
||||
std::vector<std::string> evname_;
|
||||
std::vector<unsigned> buffer_index_;
|
||||
private:
|
||||
std::vector<float> grad_, hess_, preds_;
|
||||
std::vector< std::vector<float> > eval_preds_;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
155
regression/xgboost_reg_data.h
Normal file
155
regression/xgboost_reg_data.h
Normal file
@@ -0,0 +1,155 @@
|
||||
#ifndef XGBOOST_REG_DATA_H
|
||||
#define XGBOOST_REG_DATA_H
|
||||
|
||||
/*!
|
||||
* \file xgboost_reg_data.h
|
||||
* \brief input data structure for regression and binary classification task.
|
||||
* Format:
|
||||
* The data should contain each data instance in each line.
|
||||
* The format of line data is as below:
|
||||
* label <nonzero feature dimension> [feature index:feature value]+
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include "../booster/xgboost_data.h"
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "../utils/xgboost_stream.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace regression{
|
||||
/*! \brief data matrix for regression content */
|
||||
struct DMatrix{
|
||||
public:
|
||||
/*! \brief maximum feature dimension */
|
||||
unsigned num_feature;
|
||||
/*! \brief feature data content */
|
||||
booster::FMatrixS data;
|
||||
/*! \brief label of each instance */
|
||||
std::vector<float> labels;
|
||||
public:
|
||||
/*! \brief default constructor */
|
||||
DMatrix( void ){}
|
||||
|
||||
/*! \brief get the number of instances */
|
||||
inline size_t Size() const{
|
||||
return labels.size();
|
||||
}
|
||||
/*!
|
||||
* \brief load from text file
|
||||
* \param fname name of text data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void LoadText( const char* fname, bool silent = false ){
|
||||
data.Clear();
|
||||
FILE* file = utils::FopenCheck( fname, "r" );
|
||||
float label; bool init = true;
|
||||
char tmp[ 1024 ];
|
||||
std::vector<booster::bst_uint> findex;
|
||||
std::vector<booster::bst_float> fvalue;
|
||||
|
||||
while( fscanf( file, "%s", tmp ) == 1 ){
|
||||
unsigned index; float value;
|
||||
if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
|
||||
findex.push_back( index ); fvalue.push_back( value );
|
||||
}else{
|
||||
if( !init ){
|
||||
labels.push_back( label );
|
||||
data.AddRow( findex, fvalue );
|
||||
}
|
||||
findex.clear(); fvalue.clear();
|
||||
utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" );
|
||||
init = false;
|
||||
}
|
||||
}
|
||||
|
||||
labels.push_back( label );
|
||||
data.AddRow( findex, fvalue );
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \return whether loading is success
|
||||
*/
|
||||
inline bool LoadBinary( const char* fname, bool silent = false ){
|
||||
FILE *fp = fopen64( fname, "rb" );
|
||||
if( fp == NULL ) return false;
|
||||
utils::FileStream fs( fp );
|
||||
data.LoadBinary( fs );
|
||||
labels.resize( data.NumRow() );
|
||||
utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
|
||||
fs.Close();
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is loaded from %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief save to binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void SaveBinary( const char* fname, bool silent = false ){
|
||||
// initialize column support as well
|
||||
data.InitData();
|
||||
|
||||
utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
|
||||
data.SaveBinary( fs );
|
||||
fs.Write( &labels[0], sizeof(float) * data.NumRow() );
|
||||
fs.Close();
|
||||
if( !silent ){
|
||||
printf("%ux%u matrix with %lu entries is saved to %s\n",
|
||||
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||
* otherwise the function will first check if fname + '.buffer' exists,
|
||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||
* and try to create a buffer file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \param savebuffer whether do save binary buffer if it is text
|
||||
*/
|
||||
inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){
|
||||
int len = strlen( fname );
|
||||
if( len > 8 && !strcmp( fname + len - 7, ".buffer") ){
|
||||
this->LoadBinary( fname, silent ); return;
|
||||
}
|
||||
char bname[ 1024 ];
|
||||
sprintf( bname, "%s.buffer", fname );
|
||||
if( !this->LoadBinary( bname, silent ) ){
|
||||
this->LoadText( fname, silent );
|
||||
if( savebuffer ) this->SaveBinary( bname, silent );
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief update num_feature info */
|
||||
inline void UpdateInfo( void ){
|
||||
this->num_feature = 0;
|
||||
for( size_t i = 0; i < data.NumRow(); i ++ ){
|
||||
booster::FMatrixS::Line sp = data[i];
|
||||
for( unsigned j = 0; j < sp.len; j ++ ){
|
||||
if( num_feature <= sp[j].findex ){
|
||||
num_feature = sp[j].findex + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
119
regression/xgboost_reg_eval.h
Normal file
119
regression/xgboost_reg_eval.h
Normal file
@@ -0,0 +1,119 @@
|
||||
#ifndef XGBOOST_REG_EVAL_H
|
||||
#define XGBOOST_REG_EVAL_H
|
||||
/*!
|
||||
* \file xgboost_reg_eval.h
|
||||
* \brief evaluation metrics for regression and classification
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "../utils/xgboost_utils.h"
|
||||
#include "../utils/xgboost_omp.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace regression{
|
||||
/*! \brief evaluator that evaluates the loss metrics */
|
||||
struct IEvaluator{
|
||||
/*!
|
||||
* \brief evaluate a specific metric
|
||||
* \param preds prediction
|
||||
* \param labels label
|
||||
*/
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const= 0;
|
||||
/*! \return name of metric */
|
||||
virtual const char *Name( void ) const= 0;
|
||||
};
|
||||
|
||||
/*! \brief RMSE */
|
||||
struct EvalRMSE : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
float sum = 0.0;
|
||||
#pragma omp parallel for reduction(+:sum) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
float diff = preds[i] - labels[i];
|
||||
sum += diff * diff;
|
||||
}
|
||||
return sqrtf( sum / ndata );
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
return "rmse";
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Error */
|
||||
struct EvalError : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
unsigned nerr = 0;
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
if( preds[i] > 0.5f ){
|
||||
if( labels[i] < 0.5f ) nerr += 1;
|
||||
}else{
|
||||
if( labels[i] > 0.5f ) nerr += 1;
|
||||
}
|
||||
}
|
||||
return static_cast<float>(nerr) / ndata;
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
return "error";
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*! \brief Error */
|
||||
struct EvalLogLoss : public IEvaluator{
|
||||
virtual float Eval( const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
const unsigned ndata = static_cast<unsigned>( preds.size() );
|
||||
unsigned nerr = 0;
|
||||
#pragma omp parallel for reduction(+:nerr) schedule( static )
|
||||
for( unsigned i = 0; i < ndata; ++ i ){
|
||||
const float y = labels[i];
|
||||
const float py = preds[i];
|
||||
nerr -= y * std::log(py) + (1.0f-y)*std::log(1-py);
|
||||
}
|
||||
return static_cast<float>(nerr) / ndata;
|
||||
}
|
||||
virtual const char *Name( void ) const{
|
||||
return "negllik";
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace regression{
|
||||
/*! \brief a set of evaluators */
|
||||
struct EvalSet{
|
||||
public:
|
||||
inline void AddEval( const char *name ){
|
||||
if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
|
||||
if( !strcmp( name, "error") ) evals_.push_back( &error_ );
|
||||
if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
|
||||
}
|
||||
inline void Init( void ){
|
||||
std::sort( evals_.begin(), evals_.end() );
|
||||
evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() );
|
||||
}
|
||||
inline void Eval( FILE *fo, const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const std::vector<float> &labels ) const{
|
||||
for( size_t i = 0; i < evals_.size(); ++ i ){
|
||||
float res = evals_[i]->Eval( preds, labels );
|
||||
fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
|
||||
}
|
||||
}
|
||||
private:
|
||||
EvalRMSE rmse_;
|
||||
EvalError error_;
|
||||
EvalLogLoss logloss_;
|
||||
std::vector<const IEvaluator*> evals_;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
280
regression/xgboost_reg_main.cpp
Normal file
280
regression/xgboost_reg_main.cpp
Normal file
@@ -0,0 +1,280 @@
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_DEPRECATE
|
||||
|
||||
#include <ctime>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "xgboost_reg.h"
|
||||
#include "../utils/xgboost_fmap.h"
|
||||
#include "../utils/xgboost_random.h"
|
||||
#include "../utils/xgboost_config.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace regression{
|
||||
/*!
|
||||
* \brief wrapping the training process of the gradient boosting regression model,
|
||||
* given the configuation
|
||||
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
|
||||
*/
|
||||
class RegBoostTask{
|
||||
public:
|
||||
inline int Run( int argc, char *argv[] ){
|
||||
if( argc < 2 ){
|
||||
printf("Usage: <config>\n");
|
||||
return 0;
|
||||
}
|
||||
utils::ConfigIterator itr( argv[1] );
|
||||
while( itr.Next() ){
|
||||
this->SetParam( itr.name(), itr.val() );
|
||||
}
|
||||
for( int i = 2; i < argc; i ++ ){
|
||||
char name[256], val[256];
|
||||
if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){
|
||||
this->SetParam( name, val );
|
||||
}
|
||||
}
|
||||
this->InitData();
|
||||
this->InitLearner();
|
||||
if( task == "dump" ){
|
||||
this->TaskDump();
|
||||
return 0;
|
||||
}
|
||||
if( task == "interact" ){
|
||||
this->TaskInteractive(); return 0;
|
||||
}
|
||||
if( task == "dumppath" ){
|
||||
this->TaskDumpPath(); return 0;
|
||||
}
|
||||
if( task == "eval" ){
|
||||
this->TaskEval(); return 0;
|
||||
}
|
||||
if( task == "pred" ){
|
||||
this->TaskPred();
|
||||
}else{
|
||||
this->TaskTrain();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
if( !strcmp("silent", name ) ) silent = atoi( val );
|
||||
if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val );
|
||||
if( !strcmp("seed", name ) ) random::Seed( atoi(val) );
|
||||
if( !strcmp("num_round", name ) ) num_round = atoi( val );
|
||||
if( !strcmp("save_period", name ) ) save_period = atoi( val );
|
||||
if( !strcmp("task", name ) ) task = val;
|
||||
if( !strcmp("data", name ) ) train_path = val;
|
||||
if( !strcmp("test:data", name ) ) test_path = val;
|
||||
if( !strcmp("model_in", name ) ) model_in = val;
|
||||
if( !strcmp("model_out", name ) ) model_out = val;
|
||||
if( !strcmp("model_dir", name ) ) model_dir_path = val;
|
||||
if( !strcmp("fmap", name ) ) name_fmap = val;
|
||||
if( !strcmp("name_dump", name ) ) name_dump = val;
|
||||
if( !strcmp("name_dumppath", name ) ) name_dumppath = val;
|
||||
if( !strcmp("name_pred", name ) ) name_pred = val;
|
||||
if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val );
|
||||
if( !strcmp("interact:action", name ) ) interact_action = val;
|
||||
if( !strncmp("batch:", name, 6 ) ){
|
||||
cfg_batch.PushBack( name + 6, val );
|
||||
}
|
||||
if( !strncmp("eval[", name, 5 ) ) {
|
||||
char evname[ 256 ];
|
||||
utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
|
||||
eval_data_names.push_back( std::string( evname ) );
|
||||
eval_data_paths.push_back( std::string( val ) );
|
||||
}
|
||||
cfg.PushBack( name, val );
|
||||
}
|
||||
public:
|
||||
RegBoostTask( void ){
|
||||
// default parameters
|
||||
silent = 0;
|
||||
use_buffer = 1;
|
||||
num_round = 10;
|
||||
save_period = 0;
|
||||
dump_model_stats = 0;
|
||||
task = "train";
|
||||
model_in = "NULL";
|
||||
model_out = "NULL";
|
||||
name_fmap = "NULL";
|
||||
name_pred = "pred.txt";
|
||||
name_dump = "dump.txt";
|
||||
name_dumppath = "dump.path.txt";
|
||||
model_dir_path = "./";
|
||||
interact_action = "update";
|
||||
}
|
||||
~RegBoostTask( void ){
|
||||
for( size_t i = 0; i < deval.size(); i ++ ){
|
||||
delete deval[i];
|
||||
}
|
||||
}
|
||||
private:
|
||||
inline void InitData( void ){
|
||||
if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
|
||||
if( task == "dump" ) return;
|
||||
if( task == "pred" || task == "dumppath" ){
|
||||
data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
|
||||
}else{
|
||||
// training
|
||||
data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 );
|
||||
utils::Assert( eval_data_names.size() == eval_data_paths.size() );
|
||||
for( size_t i = 0; i < eval_data_names.size(); ++ i ){
|
||||
deval.push_back( new DMatrix() );
|
||||
deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 );
|
||||
}
|
||||
}
|
||||
learner.SetData( &data, deval, eval_data_names );
|
||||
}
|
||||
inline void InitLearner( void ){
|
||||
cfg.BeforeFirst();
|
||||
while( cfg.Next() ){
|
||||
learner.SetParam( cfg.name(), cfg.val() );
|
||||
}
|
||||
if( model_in != "NULL" ){
|
||||
utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") );
|
||||
learner.LoadModel( fi );
|
||||
fi.Close();
|
||||
}else{
|
||||
utils::Assert( task == "train", "model_in not specified" );
|
||||
learner.InitModel();
|
||||
}
|
||||
learner.InitTrainer();
|
||||
}
|
||||
inline void TaskTrain( void ){
|
||||
const time_t start = time( NULL );
|
||||
unsigned long elapsed = 0;
|
||||
for( int i = 0; i < num_round; ++ i ){
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed );
|
||||
learner.UpdateOneIter( i );
|
||||
learner.EvalOneIter( i );
|
||||
if( save_period != 0 && (i+1) % save_period == 0 ){
|
||||
this->SaveModel( i );
|
||||
}
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
}
|
||||
// always save final round
|
||||
if( save_period == 0 || num_round % save_period != 0 ){
|
||||
if( model_out == "NULL" ){
|
||||
this->SaveModel( num_round - 1 );
|
||||
}else{
|
||||
this->SaveModel( model_out.c_str() );
|
||||
}
|
||||
}
|
||||
if( !silent ){
|
||||
printf("\nupdating end, %lu sec in all\n", elapsed );
|
||||
}
|
||||
}
|
||||
inline void TaskEval( void ){
|
||||
learner.EvalOneIter( 0 );
|
||||
}
|
||||
inline void TaskInteractive( void ){
|
||||
const time_t start = time( NULL );
|
||||
unsigned long elapsed = 0;
|
||||
int batch_action = 0;
|
||||
|
||||
cfg_batch.BeforeFirst();
|
||||
while( cfg_batch.Next() ){
|
||||
if( !strcmp( cfg_batch.name(), "run" ) ){
|
||||
learner.UpdateInteract( interact_action );
|
||||
batch_action += 1;
|
||||
} else{
|
||||
learner.SetParam( cfg_batch.name(), cfg_batch.val() );
|
||||
}
|
||||
}
|
||||
|
||||
if( batch_action == 0 ){
|
||||
learner.UpdateInteract( interact_action );
|
||||
}
|
||||
utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
|
||||
this->SaveModel( model_out.c_str() );
|
||||
elapsed = (unsigned long)(time(NULL) - start);
|
||||
|
||||
if( !silent ){
|
||||
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed );
|
||||
}
|
||||
}
|
||||
|
||||
inline void TaskDump( void ){
|
||||
FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
|
||||
learner.DumpModel( fo, fmap, dump_model_stats != 0 );
|
||||
fclose( fo );
|
||||
}
|
||||
inline void TaskDumpPath( void ){
|
||||
FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" );
|
||||
learner.DumpPath( fo, data );
|
||||
fclose( fo );
|
||||
}
|
||||
inline void SaveModel( const char *fname ) const{
|
||||
utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
|
||||
learner.SaveModel( fo );
|
||||
fo.Close();
|
||||
}
|
||||
inline void SaveModel( int i ) const{
|
||||
char fname[256];
|
||||
sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
|
||||
this->SaveModel( fname );
|
||||
}
|
||||
inline void TaskPred( void ){
|
||||
std::vector<float> preds;
|
||||
if( !silent ) printf("start prediction...\n");
|
||||
learner.Predict( preds, data );
|
||||
if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() );
|
||||
FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" );
|
||||
for( size_t i = 0; i < preds.size(); i ++ ){
|
||||
fprintf( fo, "%f\n", preds[i] );
|
||||
}
|
||||
fclose( fo );
|
||||
}
|
||||
private:
|
||||
/* \brief whether silent */
|
||||
int silent;
|
||||
/* \brief whether use auto binary buffer */
|
||||
int use_buffer;
|
||||
/* \brief number of boosting iterations */
|
||||
int num_round;
|
||||
/* \brief the period to save the model, 0 means only save the final round model */
|
||||
int save_period;
|
||||
/*! \brief interfact action */
|
||||
std::string interact_action;
|
||||
/* \brief the path of training/test data set */
|
||||
std::string train_path, test_path;
|
||||
/* \brief the path of test model file, or file to restart training */
|
||||
std::string model_in;
|
||||
/* \brief the path of final model file, to be saved */
|
||||
std::string model_out;
|
||||
/* \brief the path of directory containing the saved models */
|
||||
std::string model_dir_path;
|
||||
/* \brief task to perform */
|
||||
std::string task;
|
||||
/* \brief name of predict file */
|
||||
std::string name_pred;
|
||||
/* \brief whether dump statistics along with model */
|
||||
int dump_model_stats;
|
||||
/* \brief name of feature map */
|
||||
std::string name_fmap;
|
||||
/* \brief name of dump file */
|
||||
std::string name_dump;
|
||||
/* \brief name of dump path file */
|
||||
std::string name_dumppath;
|
||||
/* \brief the paths of validation data sets */
|
||||
std::vector<std::string> eval_data_paths;
|
||||
/* \brief the names of the evaluation data used in output log */
|
||||
std::vector<std::string> eval_data_names;
|
||||
/*! \brief saves configurations */
|
||||
utils::ConfigSaver cfg;
|
||||
/*! \brief batch configurations */
|
||||
utils::ConfigSaver cfg_batch;
|
||||
private:
|
||||
DMatrix data;
|
||||
std::vector<DMatrix*> deval;
|
||||
utils::FeatMap fmap;
|
||||
RegBoostLearner learner;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
int main( int argc, char *argv[] ){
|
||||
xgboost::random::Seed( 0 );
|
||||
xgboost::regression::RegBoostTask tsk;
|
||||
return tsk.Run( argc, argv );
|
||||
}
|
||||
214
utils/xgboost_config.h
Normal file
214
utils/xgboost_config.h
Normal file
@@ -0,0 +1,214 @@
|
||||
#ifndef XGBOOST_CONFIG_H
|
||||
#define XGBOOST_CONFIG_H
|
||||
/*!
|
||||
* \file xgboost_config.h
|
||||
* \brief helper class to load in configures from file
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include "xgboost_utils.h"
|
||||
#include <vector>
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief an iterator that iterates over a configure file and gets the configures
|
||||
*/
|
||||
class ConfigIterator{
|
||||
public:
|
||||
/*!
|
||||
* \brief constructor
|
||||
* \param fname name of configure file
|
||||
*/
|
||||
ConfigIterator( const char *fname ){
|
||||
fi = FopenCheck( fname, "r");
|
||||
ch_buf = fgetc( fi );
|
||||
}
|
||||
/*! \brief destructor */
|
||||
~ConfigIterator(){
|
||||
fclose( fi );
|
||||
}
|
||||
/*!
|
||||
* \brief get current name, called after Next returns true
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name( void )const{
|
||||
return s_name;
|
||||
}
|
||||
/*!
|
||||
* \brief get current value, called after Next returns true
|
||||
* \return current parameter value
|
||||
*/
|
||||
inline const char *val( void ) const{
|
||||
return s_val;
|
||||
}
|
||||
/*!
|
||||
* \brief move iterator to next position
|
||||
* \return true if there is value in next position
|
||||
*/
|
||||
inline bool Next( void ){
|
||||
while( !feof( fi ) ){
|
||||
GetNextToken( s_name );
|
||||
if( s_name[0] == '=') return false;
|
||||
if( GetNextToken( s_buf ) || s_buf[0] != '=' ) return false;
|
||||
if( GetNextToken( s_val ) || s_val[0] == '=' ) return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
private:
|
||||
FILE *fi;
|
||||
char ch_buf;
|
||||
char s_name[256],s_val[256],s_buf[246];
|
||||
|
||||
inline void SkipLine(){
|
||||
do{
|
||||
ch_buf = fgetc( fi );
|
||||
}while( ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r' );
|
||||
}
|
||||
|
||||
inline void ParseStr( char tok[] ){
|
||||
int i = 0;
|
||||
while( (ch_buf = fgetc(fi)) != EOF ){
|
||||
switch( ch_buf ){
|
||||
case '\\': tok[i++] = fgetc( fi ); break;
|
||||
case '\"': tok[i++] = '\0';
|
||||
return;
|
||||
case '\r':
|
||||
case '\n': Error("unterminated string"); break;
|
||||
default: tok[i++] = ch_buf;
|
||||
}
|
||||
}
|
||||
Error("unterminated string");
|
||||
}
|
||||
// return newline
|
||||
inline bool GetNextToken( char tok[] ){
|
||||
int i = 0;
|
||||
bool new_line = false;
|
||||
while( ch_buf != EOF ){
|
||||
switch( ch_buf ){
|
||||
case '#' : SkipLine(); new_line = true; break;
|
||||
case '\"':
|
||||
if( i == 0 ){
|
||||
ParseStr( tok );ch_buf = fgetc(fi); return new_line;
|
||||
}else{
|
||||
Error("token followed directly by string");
|
||||
}
|
||||
case '=':
|
||||
if( i == 0 ) {
|
||||
ch_buf = fgetc( fi );
|
||||
tok[0] = '=';
|
||||
tok[1] = '\0';
|
||||
}else{
|
||||
tok[i] = '\0';
|
||||
}
|
||||
return new_line;
|
||||
case '\r':
|
||||
case '\n':
|
||||
if( i == 0 ) new_line = true;
|
||||
case '\t':
|
||||
case ' ' :
|
||||
ch_buf = fgetc( fi );
|
||||
if( i > 0 ){
|
||||
tok[i] = '\0';
|
||||
return new_line;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
tok[i++] = ch_buf;
|
||||
ch_buf = fgetc( fi );
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief a class that save parameter configurations
|
||||
* temporally and allows to get them out later
|
||||
* there are two kinds of priority in ConfigSaver
|
||||
*/
|
||||
class ConfigSaver{
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
ConfigSaver( void ){ idx = 0; }
|
||||
/*! \brief clear all saves */
|
||||
inline void Clear( void ){
|
||||
idx = 0;
|
||||
names.clear(); values.clear();
|
||||
names_high.clear(); values_high.clear();
|
||||
}
|
||||
/*!
|
||||
* \brief push back a parameter setting
|
||||
* \param name name of parameter
|
||||
* \param val value of parameter
|
||||
* \param priority whether the setting has higher priority: high priority occurs
|
||||
* latter when read from ConfigSaver, and can overwrite existing settings
|
||||
*/
|
||||
inline void PushBack( const char *name, const char *val, int priority = 0 ){
|
||||
if( priority == 0 ){
|
||||
names.push_back( std::string( name ) );
|
||||
values.push_back( std::string( val ) );
|
||||
}else{
|
||||
names_high.push_back( std::string( name ) );
|
||||
values_high.push_back( std::string( val ) );
|
||||
}
|
||||
}
|
||||
/*! \brief set pointer to beginning of the ConfigSaver */
|
||||
inline void BeforeFirst( void ){
|
||||
idx = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief move iterator to next position
|
||||
* \return true if there is value in next position
|
||||
*/
|
||||
inline bool Next( void ){
|
||||
if( idx >= names.size() + names_high.size() ){
|
||||
return false;
|
||||
}
|
||||
idx ++;
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief get current name, called after Next returns true
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name( void ) const{
|
||||
Assert( idx > 0, "can't call name before first");
|
||||
size_t i = idx - 1;
|
||||
if( i >= names.size() ){
|
||||
return names_high[ i - names.size() ].c_str();
|
||||
}else{
|
||||
return names[ i ].c_str();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief get current value, called after Next returns true
|
||||
* \return current parameter value
|
||||
*/
|
||||
inline const char *val( void ) const{
|
||||
Assert( idx > 0, "can't call name before first");
|
||||
size_t i = idx - 1;
|
||||
if( i >= values.size() ){
|
||||
return values_high[ i - values.size() ].c_str();
|
||||
}else{
|
||||
return values[ i ].c_str();
|
||||
}
|
||||
}
|
||||
private:
|
||||
std::vector<std::string> names;
|
||||
std::vector<std::string> values;
|
||||
std::vector<std::string> names_high;
|
||||
std::vector<std::string> values_high;
|
||||
size_t idx;
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
123
utils/xgboost_fmap.h
Normal file
123
utils/xgboost_fmap.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#ifndef XGBOOST_FMAP_H
|
||||
#define XGBOOST_FMAP_H
|
||||
/*!
|
||||
* \file xgboost_fmap.h
|
||||
* \brief helper class that holds the feature names and interpretations
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "xgboost_utils.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*! \brief helper class that holds the feature names and interpretations */
|
||||
class FeatMap{
|
||||
public:
|
||||
enum Type{
|
||||
kIndicator = 0,
|
||||
kQuantitive = 1,
|
||||
kInteger = 2,
|
||||
kFloat = 3
|
||||
};
|
||||
public:
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText( const char *fname ){
|
||||
FILE *fi = utils::FopenCheck( fname, "r" );
|
||||
this->LoadText( fi );
|
||||
fclose( fi );
|
||||
}
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText( FILE *fi ){
|
||||
int fid;
|
||||
char fname[256], ftype[256];
|
||||
while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){
|
||||
utils::Assert( fid == (int)names_.size(), "invalid fmap format" );
|
||||
names_.push_back( std::string(fname) );
|
||||
types_.push_back( GetType( ftype ) );
|
||||
}
|
||||
}
|
||||
/*! \brief number of known features */
|
||||
size_t size( void ) const{
|
||||
return names_.size();
|
||||
}
|
||||
/*! \brief return name of specific feature */
|
||||
const char* name( size_t idx ) const{
|
||||
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
|
||||
return names_[ idx ].c_str();
|
||||
}
|
||||
/*! \brief return type of specific feature */
|
||||
const Type& type( size_t idx ) const{
|
||||
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
|
||||
return types_[ idx ];
|
||||
}
|
||||
private:
|
||||
inline static Type GetType( const char *tname ){
|
||||
if( !strcmp( "i", tname ) ) return kIndicator;
|
||||
if( !strcmp( "q", tname ) ) return kQuantitive;
|
||||
if( !strcmp( "int", tname ) ) return kInteger;
|
||||
if( !strcmp( "float", tname ) ) return kFloat;
|
||||
utils::Error("unknown feature type, use i for indicator and q for quantity");
|
||||
return kIndicator;
|
||||
}
|
||||
private:
|
||||
/*! \brief name of the feature */
|
||||
std::vector<std::string> names_;
|
||||
/*! \brief type of the feature */
|
||||
std::vector<Type> types_;
|
||||
};
|
||||
}; // namespace utils
|
||||
|
||||
namespace utils{
|
||||
/*! \brief feature constraint, allow or disallow some feature during training */
|
||||
class FeatConstrain{
|
||||
public:
|
||||
FeatConstrain( void ){
|
||||
default_state_ = +1;
|
||||
}
|
||||
/*!\brief set parameters */
|
||||
inline void SetParam( const char *name, const char *val ){
|
||||
int a, b;
|
||||
if( !strcmp( name, "fban") ){
|
||||
this->ParseRange( val, a, b );
|
||||
this->SetRange( a, b, -1 );
|
||||
}
|
||||
if( !strcmp( name, "fpass") ){
|
||||
this->ParseRange( val, a, b );
|
||||
this->SetRange( a, b, +1 );
|
||||
}
|
||||
if( !strcmp( name, "fdefault") ){
|
||||
default_state_ = atoi( val );
|
||||
}
|
||||
}
|
||||
/*! \brief whether constrain is specified */
|
||||
inline bool HasConstrain( void ) const {
|
||||
return state_.size() != 0 && default_state_ == 1;
|
||||
}
|
||||
/*! \brief whether a feature index is banned or not */
|
||||
inline bool NotBanned( unsigned index ) const{
|
||||
int rt = index < state_.size() ? state_[index] : default_state_;
|
||||
if( rt == 0 ) rt = default_state_;
|
||||
return rt == 1;
|
||||
}
|
||||
private:
|
||||
inline void SetRange( int a, int b, int st ){
|
||||
if( b > (int)state_.size() ) state_.resize( b, 0 );
|
||||
for( int i = a; i < b; ++ i ){
|
||||
state_[i] = st;
|
||||
}
|
||||
}
|
||||
inline void ParseRange( const char *val, int &a, int &b ){
|
||||
if( sscanf( val, "%d-%d", &a, &b ) == 2 ) return;
|
||||
utils::Assert( sscanf( val, "%d", &a ) == 1 );
|
||||
b = a + 1;
|
||||
}
|
||||
/*! \brief default state */
|
||||
int default_state_;
|
||||
/*! \brief whether the state here is, +1:pass, -1: ban, 0:default */
|
||||
std::vector<int> state_;
|
||||
};
|
||||
}; // namespace utils
|
||||
}; // namespace xgboost
|
||||
#endif // XGBOOST_FMAP_H
|
||||
155
utils/xgboost_matrix_csr.h
Normal file
155
utils/xgboost_matrix_csr.h
Normal file
@@ -0,0 +1,155 @@
|
||||
/*!
|
||||
* \file xgboost_matrix_csr.h
|
||||
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
#ifndef XGBOOST_MATRIX_CSR_H
|
||||
#define XGBOOST_MATRIX_CSR_H
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "xgboost_utils.h"
|
||||
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief a class used to help construct CSR format matrix,
|
||||
* can be used to convert row major CSR to column major CSR
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
* \tparam whether enabling the usage of aclist, this option must be enabled manually
|
||||
*/
|
||||
template<typename IndexType,bool UseAcList = false>
|
||||
struct SparseCSRMBuilder{
|
||||
private:
|
||||
/*! \brief dummy variable used in the indicator matrix construction */
|
||||
std::vector<size_t> dummy_aclist;
|
||||
/*! \brief pointer to each of the row */
|
||||
std::vector<size_t> &rptr;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<IndexType> &findex;
|
||||
/*! \brief a list of active rows, used when many rows are empty */
|
||||
std::vector<size_t> &aclist;
|
||||
public:
|
||||
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex )
|
||||
:rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){
|
||||
Assert( !UseAcList, "enabling bug" );
|
||||
}
|
||||
/*! \brief use with caution! rptr must be cleaned before use */
|
||||
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
|
||||
std::vector<IndexType> &p_findex,
|
||||
std::vector<size_t> &p_aclist )
|
||||
:rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){
|
||||
Assert( UseAcList, "must manually enable the option use aclist" );
|
||||
}
|
||||
public:
|
||||
/*!
|
||||
* \brief step 1: initialize the number of rows in the data, not necessary exact
|
||||
* \nrows number of rows in the matrix, can be smaller than expected
|
||||
*/
|
||||
inline void InitBudget( size_t nrows = 0 ){
|
||||
if( !UseAcList ){
|
||||
rptr.clear();
|
||||
rptr.resize( nrows + 1, 0 );
|
||||
}else{
|
||||
Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" );
|
||||
this->Cleanup();
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief step 2: add budget to each rows, this function is called when aclist is used
|
||||
* \param row_id the id of the row
|
||||
* \param nelem number of element budget add to this row
|
||||
*/
|
||||
inline void AddBudget( size_t row_id, size_t nelem = 1 ){
|
||||
if( rptr.size() < row_id + 2 ){
|
||||
rptr.resize( row_id + 2, 0 );
|
||||
}
|
||||
if( UseAcList ){
|
||||
if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id );
|
||||
}
|
||||
rptr[ row_id + 1 ] += nelem;
|
||||
}
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage( void ){
|
||||
// initialize rptr to be beginning of each segment
|
||||
size_t start = 0;
|
||||
if( !UseAcList ){
|
||||
for( size_t i = 1; i < rptr.size(); i ++ ){
|
||||
size_t rlen = rptr[ i ];
|
||||
rptr[ i ] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}else{
|
||||
// case with active list
|
||||
std::sort( aclist.begin(), aclist.end() );
|
||||
|
||||
for( size_t i = 0; i < aclist.size(); i ++ ){
|
||||
size_t ridx = aclist[ i ];
|
||||
size_t rlen = rptr[ ridx + 1 ];
|
||||
rptr[ ridx + 1 ] = start;
|
||||
// set previous rptr to right position if previous feature is not active
|
||||
if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start;
|
||||
start += rlen;
|
||||
}
|
||||
}
|
||||
findex.resize( start );
|
||||
}
|
||||
/*!
|
||||
* \brief step 4:
|
||||
* used in indicator matrix construction, add new
|
||||
* element to each row, the number of calls shall be exactly same as add_budget
|
||||
*/
|
||||
inline void PushElem( size_t row_id, IndexType col_id ){
|
||||
size_t &rp = rptr[ row_id + 1 ];
|
||||
findex[ rp ++ ] = col_id;
|
||||
}
|
||||
/*!
|
||||
* \brief step 5: only needed when aclist is used
|
||||
* clean up the rptr for next usage
|
||||
*/
|
||||
inline void Cleanup( void ){
|
||||
Assert( UseAcList, "this function can only be called use AcList" );
|
||||
for( size_t i = 0; i < aclist.size(); i ++ ){
|
||||
const size_t ridx = aclist[i];
|
||||
rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0;
|
||||
}
|
||||
aclist.clear();
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief simple sparse matrix container
|
||||
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
|
||||
*/
|
||||
template<typename IndexType>
|
||||
struct SparseCSRMat{
|
||||
private:
|
||||
/*! \brief pointer to each of the row */
|
||||
std::vector<size_t> rptr;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<IndexType> findex;
|
||||
public:
|
||||
/*! \brief matrix builder*/
|
||||
SparseCSRMBuilder<IndexType> builder;
|
||||
public:
|
||||
SparseCSRMat( void ):builder( rptr, findex ){
|
||||
}
|
||||
public:
|
||||
/*! \return number of rows in the matrx */
|
||||
inline size_t NumRow( void ) const{
|
||||
return rptr.size() - 1;
|
||||
}
|
||||
/*! \return number of elements r-th row */
|
||||
inline size_t NumElem( size_t r ) const{
|
||||
return rptr[ r + 1 ] - rptr[ r ];
|
||||
}
|
||||
/*! \return r-th row */
|
||||
inline const IndexType *operator[]( size_t r ) const{
|
||||
return &findex[ rptr[r] ];
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
18
utils/xgboost_omp.h
Normal file
18
utils/xgboost_omp.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef XGBOOST_OMP_H
|
||||
#define XGBOOST_OMP_H
|
||||
/*!
|
||||
* \file xgboost_omp.h
|
||||
* \brief header to handle OpenMP compatibility issues
|
||||
*
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#else
|
||||
#warning "OpenMP is not available, compile to single thread code"
|
||||
inline int omp_get_thread_num() { return 0; }
|
||||
inline int omp_get_num_threads() { return 1; }
|
||||
inline void omp_set_num_threads( int nthread ) {}
|
||||
#endif
|
||||
#endif
|
||||
131
utils/xgboost_random.h
Normal file
131
utils/xgboost_random.h
Normal file
@@ -0,0 +1,131 @@
|
||||
#ifndef XGBOOST_RANDOM_H
|
||||
#define XGBOOST_RANDOM_H
|
||||
/*!
|
||||
* \file xgboost_random.h
|
||||
* \brief PRNG to support random number generation
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*
|
||||
* Use standard PRNG from stdlib
|
||||
*/
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short int uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
#else
|
||||
#include <inttypes.h>
|
||||
#endif
|
||||
|
||||
/*! namespace of PRNG */
|
||||
namespace xgboost{
|
||||
namespace random{
|
||||
/*! \brief seed the PRNG */
|
||||
inline void Seed( uint32_t seed ){
|
||||
srand( seed );
|
||||
}
|
||||
|
||||
/*! \brief return a real number uniform in [0,1) */
|
||||
inline double NextDouble(){
|
||||
return static_cast<double>( rand() ) / (static_cast<double>( RAND_MAX )+1.0);
|
||||
}
|
||||
/*! \brief return a real numer uniform in (0,1) */
|
||||
inline double NextDouble2(){
|
||||
return (static_cast<double>( rand() ) + 1.0 ) / (static_cast<double>(RAND_MAX) + 2.0);
|
||||
}
|
||||
};
|
||||
|
||||
namespace random{
|
||||
/*! \brief return a random number */
|
||||
inline uint32_t NextUInt32( void ){
|
||||
return (uint32_t)rand();
|
||||
}
|
||||
/*! \brief return a random number in n */
|
||||
inline uint32_t NextUInt32( uint32_t n ){
|
||||
return (uint32_t) floor( NextDouble() * n ) ;
|
||||
}
|
||||
/*! \brief return x~N(0,1) */
|
||||
inline double SampleNormal(){
|
||||
double x,y,s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
}while( s >= 1.0 || s == 0.0 );
|
||||
|
||||
return x * sqrt( -2.0 * log(s) / s ) ;
|
||||
}
|
||||
|
||||
/*! \brief return iid x,y ~N(0,1) */
|
||||
inline void SampleNormal2D( double &xx, double &yy ){
|
||||
double x,y,s;
|
||||
do{
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
}while( s >= 1.0 || s == 0.0 );
|
||||
double t = sqrt( -2.0 * log(s) / s ) ;
|
||||
xx = x * t;
|
||||
yy = y * t;
|
||||
}
|
||||
/*! \brief return x~N(mu,sigma^2) */
|
||||
inline double SampleNormal( double mu, double sigma ){
|
||||
return SampleNormal() * sigma + mu;
|
||||
}
|
||||
|
||||
/*! \brief return 1 with probability p, coin flip */
|
||||
inline int SampleBinary( double p ){
|
||||
return NextDouble() < p;
|
||||
}
|
||||
|
||||
/*! \brief return distribution from Gamma( alpha, beta ) */
|
||||
inline double SampleGamma( double alpha, double beta ) {
|
||||
if ( alpha < 1.0 ) {
|
||||
double u;
|
||||
do {
|
||||
u = NextDouble();
|
||||
} while (u == 0.0);
|
||||
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
|
||||
} else {
|
||||
double d,c,x,v,u;
|
||||
d = alpha - 1.0/3.0;
|
||||
c = 1.0 / sqrt( 9.0 * d );
|
||||
do {
|
||||
do {
|
||||
x = SampleNormal();
|
||||
v = 1.0 + c*x;
|
||||
} while ( v <= 0.0 );
|
||||
v = v * v * v;
|
||||
u = NextDouble();
|
||||
} while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x)))
|
||||
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) );
|
||||
return d * v / beta;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline void Exchange( T &a, T &b ){
|
||||
T c;
|
||||
c = a;
|
||||
a = b;
|
||||
b = c;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline void Shuffle( T *data, size_t sz ){
|
||||
if( sz == 0 ) return;
|
||||
for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){
|
||||
Exchange( data[i], data[ NextUInt32( i+1 ) ] );
|
||||
}
|
||||
}
|
||||
// random shuffle the data inside, require PRNG
|
||||
template<typename T>
|
||||
inline void Shuffle( std::vector<T> &data ){
|
||||
Shuffle( &data[0], data.size() );
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
54
utils/xgboost_stream.h
Normal file
54
utils/xgboost_stream.h
Normal file
@@ -0,0 +1,54 @@
|
||||
#ifndef XGBOOST_STREAM_H
|
||||
#define XGBOOST_STREAM_H
|
||||
|
||||
#include <cstdio>
|
||||
/*!
|
||||
* \file xgboost_stream.h
|
||||
* \brief general stream interface for serialization
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
namespace xgboost{
|
||||
namespace utils{
|
||||
/*!
|
||||
* \brief interface of stream I/O, used to serialize model
|
||||
*/
|
||||
class IStream{
|
||||
public:
|
||||
/*!
|
||||
* \brief read data from stream
|
||||
* \param ptr pointer to memory buffer
|
||||
* \param size size of block
|
||||
* \return usually is the size of data readed
|
||||
*/
|
||||
virtual size_t Read( void *ptr, size_t size ) = 0;
|
||||
/*!
|
||||
* \brief write data to stream
|
||||
* \param ptr pointer to memory buffer
|
||||
* \param size size of block
|
||||
*/
|
||||
virtual void Write( const void *ptr, size_t size ) = 0;
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~IStream( void ){}
|
||||
};
|
||||
|
||||
/*! \brief implementation of file i/o stream */
|
||||
class FileStream: public IStream{
|
||||
private:
|
||||
FILE *fp;
|
||||
public:
|
||||
FileStream( FILE *fp ){
|
||||
this->fp = fp;
|
||||
}
|
||||
virtual size_t Read( void *ptr, size_t size ){
|
||||
return fread( ptr, size, 1, fp );
|
||||
}
|
||||
virtual void Write( const void *ptr, size_t size ){
|
||||
fwrite( ptr, size, 1, fp );
|
||||
}
|
||||
inline void Close( void ){
|
||||
fclose( fp );
|
||||
}
|
||||
};
|
||||
};
|
||||
};
|
||||
#endif
|
||||
68
utils/xgboost_utils.h
Normal file
68
utils/xgboost_utils.h
Normal file
@@ -0,0 +1,68 @@
|
||||
#ifndef XGBOOST_UTILS_H
|
||||
#define XGBOOST_UTILS_H
|
||||
/*!
|
||||
* \file xgboost_utils.h
|
||||
* \brief simple utils to support the code
|
||||
* \author Tianqi Chen: tianqi.tchen@gmail.com
|
||||
*/
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#ifdef _MSC_VER
|
||||
#define fopen64 fopen
|
||||
#else
|
||||
|
||||
// use 64 bit offset, either to include this header in the beginning, or
|
||||
#ifdef _FILE_OFFSET_BITS
|
||||
#if _FILE_OFFSET_BITS == 32
|
||||
#warning "FILE OFFSET BITS defined to be 32 bit"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define off64_t off_t
|
||||
#define fopen64 fopen
|
||||
#endif
|
||||
|
||||
#define _FILE_OFFSET_BITS 64
|
||||
extern "C"{
|
||||
#include <sys/types.h>
|
||||
};
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace xgboost{
|
||||
/*! \brief namespace for helper utils of the project */
|
||||
namespace utils{
|
||||
inline void Error( const char *msg ){
|
||||
fprintf( stderr, "Error:%s\n",msg );
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
inline void Assert( bool exp ){
|
||||
if( !exp ) Error( "AssertError" );
|
||||
}
|
||||
|
||||
inline void Assert( bool exp, const char *msg ){
|
||||
if( !exp ) Error( msg );
|
||||
}
|
||||
|
||||
inline void Warning( const char *msg ){
|
||||
fprintf( stderr, "warning:%s\n",msg );
|
||||
}
|
||||
|
||||
/*! \brief replace fopen, report error when the file open fails */
|
||||
inline FILE *FopenCheck( const char *fname , const char *flag ){
|
||||
FILE *fp = fopen64( fname , flag );
|
||||
if( fp == NULL ){
|
||||
fprintf( stderr, "can not open file \"%s\"\n",fname );
|
||||
exit( -1 );
|
||||
}
|
||||
return fp;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user