Compare commits

...

110 Commits

Author SHA1 Message Date
tqchen
2aa1031d24 add dump nice to regression demo 2014-03-26 16:47:01 -07:00
tqchen
1440dc9c8f update regression 2014-03-26 16:25:44 -07:00
kalenhaha
27bd5496a8 small fix 2014-03-27 00:08:47 +08:00
kalenhaha
81b32525e0 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-26 23:50:56 +08:00
tqchen
6fa0948461 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-03-25 17:18:27 -07:00
tqchen
61123f86aa small fix 2014-03-25 17:17:00 -07:00
Tianqi Chen
110b97fea2 Update README.md 2014-03-26 08:01:47 +08:00
Tianqi Chen
b2eb4e956b Update README.md 2014-03-26 08:01:24 +08:00
Tianqi Chen
56ae0e32e3 Update README 2014-03-26 07:21:15 +08:00
kalenhaha
e350c38483 change the regression demo data set 2014-03-24 23:23:11 +08:00
tqchen
e59ed018e6 fix test to pred 2014-03-24 00:31:53 -07:00
kalenhaha
3123d11655 remove test directory 2014-03-23 00:05:46 +08:00
kalenhaha
ca74cba9ec adding regression demo 2014-03-22 21:52:29 +08:00
kalenhaha
a84d4f3e68 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-22 21:50:31 +08:00
kalenhaha
76cd1561a0 separate binary classification and regression demo 2014-03-22 21:48:27 +08:00
Tianqi Chen
5b4f77488c Update README.md 2014-03-20 23:12:41 -07:00
Tianqi Chen
b0676fc682 Update README.md 2014-03-20 23:12:16 -07:00
tqchen
97418b113e add batch running 2014-03-20 16:27:24 -07:00
tqchen
d56394d2ef add feature constraint 2014-03-19 10:47:56 -07:00
tqchen
6a91438634 fixed remove bug 2014-03-13 13:42:40 -07:00
tqchen
da3b3c8136 neglok 2014-03-12 20:28:21 -07:00
tqchen
fcf06a7164 support int type 2014-03-12 17:58:14 -07:00
tqchen
8f9efa2725 more compact 2014-03-11 13:07:20 -07:00
tqchen
6e48a938c6 add accuracy 2014-03-11 13:06:22 -07:00
tqchen
19b28b978d fix delete 2014-03-11 12:40:51 -07:00
tqchen
8f16ef8e75 add remove tree 2014-03-11 11:25:50 -07:00
tqchen
d2377b26bd add name dumpath 2014-03-06 11:23:51 -08:00
tqchen
70f3f31206 add add and remove 2014-03-05 16:39:07 -08:00
tqchen
f62c5dc3c1 try interact mode 2014-03-05 15:28:53 -08:00
tqchen
2d67377a96 add a test folder 2014-03-05 15:20:11 -08:00
tqchen
d982be9dca complete row maker 2014-03-05 14:38:13 -08:00
tqchen
98114cabce add row tree maker, to be finished 2014-03-05 11:00:03 -08:00
tqchen
2910bdedf4 split new base treemaker, not very good abstraction, but ok 2014-03-05 10:20:36 -08:00
tqchen
128e94be1a fix reg model_out 2014-03-05 09:34:37 -08:00
tqchen
eade6ddf7c reupdate data 2014-03-04 22:47:39 -08:00
tqchen
9b45210fa7 fix text 2014-03-04 16:22:24 -08:00
tqchen
ddd61b43be fix fmatrix 2014-03-04 11:45:22 -08:00
tqchen
98e851d80f add simple text loader 2014-03-04 11:33:33 -08:00
tqchen
3d223232e3 ok fix 2014-03-03 22:20:45 -08:00
tqchen
b689b4525a big change, change interface to template, everything still OK 2014-03-03 22:16:37 -08:00
tqchen
a3ca03cfc1 backup makefile 2014-03-03 15:21:50 -08:00
tqchen
2aa1978cb6 compatibility issue with openmp 2014-03-03 15:11:41 -08:00
tqchen
e3b7abfb47 ok 2014-03-03 12:26:40 -08:00
tqchen
2adf905dcf maptree is not needed 2014-03-03 11:06:24 -08:00
tqchen
cfbeeef9c1 fix fmap 2014-03-03 11:05:10 -08:00
tqchen
8ae1d37828 auto do reboost 2014-03-02 16:42:22 -08:00
tqchen
0fc64d1c2a chg file name of reg 2014-03-02 16:39:00 -08:00
tqchen
1eca127f69 chg file name of reg 2014-03-02 16:38:59 -08:00
tqchen
c7b29774c2 change test task to pred 2014-03-02 16:20:42 -08:00
tqchen
a8f69878eb make style more like Google style 2014-03-02 13:30:24 -08:00
tqchen
51b6d86c17 add smart decision of nfeatures 2014-03-01 21:49:29 -08:00
tqchen
082a57ba0b fix type 2014-03-01 21:29:07 -08:00
tqchen
f3c98d0c4b add smart load 2014-03-01 21:15:54 -08:00
tqchen
1748e4517a full omp support for regression 2014-03-01 20:56:25 -08:00
tqchen
328e41244c fix col maker, make it default 2014-03-01 15:16:30 -08:00
tqchen
155b593984 add col maker 2014-03-01 14:00:09 -08:00
Tianqi Chen
76cbc754c9 Update README.md 2014-02-28 20:13:01 -08:00
Tianqi Chen
97ca3bf739 Update README.md 2014-02-28 20:10:57 -08:00
tqchen
752f336cb3 chg license, README 2014-02-28 20:09:40 -08:00
tqchen
fffad41e53 start add coltree maker 2014-02-28 11:44:50 -08:00
tqchen
10382f6365 add dump2json 2014-02-26 18:54:12 -08:00
tqchen
7b2fe1bf5d add pathdump 2014-02-26 17:08:23 -08:00
tqchen
88c982012a modify tree so that training is standalone 2014-02-26 16:03:00 -08:00
tqchen
b6f98bf37a modify tree so that training is standalone 2014-02-26 16:02:58 -08:00
tqchen
3a4d0f28d9 change input data structure 2014-02-26 11:51:58 -08:00
tqchen
e58daa6d52 fix mushroom 2014-02-24 23:19:58 -08:00
tqchen
a5b37e0395 finish mushroom 2014-02-24 23:06:57 -08:00
tqchen
e75488b578 add mushroom classification 2014-02-24 22:25:43 -08:00
tqchen
1160a38323 add mushroom 2014-02-24 22:19:40 -08:00
tqchen
4401d549f1 pass simple test 2014-02-20 22:28:05 -08:00
tqchen
fd120a8f5c changes to reg booster 2014-02-20 22:08:31 -08:00
kalenhaha
00add6dd1d tab eliminated 2014-02-19 13:25:01 +08:00
kalenhaha
cd009f2541 add toy data 2014-02-19 13:01:15 +08:00
kalenhaha
582be45810 add in reg.conf for configuration demo 2014-02-18 16:49:23 +08:00
kalenhaha
3c93216850 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-16 14:34:35 +08:00
kalenhaha
787f76e952 fix some bugs 2014-02-16 11:44:03 +08:00
tqchen
91c170e463 fix nboosters 2014-02-15 19:42:02 -08:00
tqchen
0c44347e82 update license 2014-02-15 17:45:48 -08:00
tqchen
603704287d Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-02-15 17:42:31 -08:00
tqchen
c933625f94 update license 2014-02-15 17:42:23 -08:00
tqchen
cebf39ea47 Update README.md 2014-02-15 11:22:50 -08:00
kalenhaha
f22139c659 Comments added 2014-02-13 13:04:55 +08:00
kalenhaha
06ce8c9f3a GBRT Train and Test Phase added 2014-02-12 23:30:32 +08:00
tqchen
98a60b3610 Update README.md 2014-02-11 20:38:06 -08:00
tqchen
2dc6c9c683 chg fmt to libsvm 2014-02-10 21:41:43 -08:00
tqchen
3e53fcf465 cleanup reg 2014-02-10 21:09:09 -08:00
tqchen
cb0fa75252 add regression data 2014-02-10 20:32:23 -08:00
kalenhaha
51a63d80d0 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-11 11:19:27 +08:00
kalenhaha
1e356c5bd2 gbrt modified 2014-02-11 11:07:00 +08:00
kalenhaha
c5ada79be5 gbrt implemented 2014-02-10 23:40:38 +08:00
tqchen
dd924becd8 Update README.md 2014-02-08 19:02:33 -08:00
tqchen
7fa301a8ce Update README.md 2014-02-08 13:01:10 -08:00
tqchen
3d1e0badd3 Update README.md 2014-02-08 13:00:49 -08:00
tqchen
7e605306ad Update README.md 2014-02-08 12:50:24 -08:00
tqchen
5e5acdc121 finish readme 2014-02-08 11:47:37 -08:00
tqchen
7302a4e1b5 add linear booster 2014-02-08 11:24:35 -08:00
tqchen
21dd4b5904 add ok 2014-02-07 22:51:16 -08:00
tqchen
61e5410789 chg makefile 2014-02-07 22:43:13 -08:00
tqchen
0febb1a443 adapt tree booster 2014-02-07 22:41:32 -08:00
tqchen
36a04f17df adapt svdfeature tree 2014-02-07 22:38:26 -08:00
tqchen
3dd477c4b2 add detailed comment about gbmcore 2014-02-07 20:30:39 -08:00
tqchen
779d6a34de add empty folder for regression. TODO 2014-02-07 20:20:09 -08:00
tqchen
4535ab7e5c move core code to booster 2014-02-07 20:13:27 -08:00
tqchen
75c36a0667 add base code 2014-02-07 18:40:53 -08:00
tqchen
790c76e814 sync everything 2014-02-06 21:28:47 -08:00
tqchen
a81ea03022 add config 2014-02-06 21:26:27 -08:00
tqchen
a198759df6 update this folder 2014-02-06 16:06:59 -08:00
tqchen
a607444038 update this folder 2014-02-06 16:06:18 -08:00
tqchen
ee6a0c7f4a initial cleanup of interface 2014-02-06 16:03:04 -08:00
tqchen
57fef8bc54 init commit 2014-02-06 15:50:50 -08:00
41 changed files with 13934 additions and 202 deletions

6
.gitignore vendored
View File

@@ -11,3 +11,9 @@
*.lai
*.la
*.a
*~
*txt*
*conf
*buffer
*model
xgboost

209
LICENSE
View File

@@ -1,202 +1,13 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
Copyright (c) 2014 Tianqi Chen
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
Makefile Normal file
View File

@@ -0,0 +1,25 @@
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
# specify tensor path
BIN = xgboost
OBJ =
.PHONY: clean all
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
install:
cp -f -r $(BIN) $(INSTALL_PATH)
clean:
$(RM) $(OBJ) $(BIN) *~

View File

@@ -1,4 +1,40 @@
xgboost
xgboost: eXtreme Gradient Boosting
=======
A General purpose gradient boosting (tree) library.
General Purpose Gradient Boosting Library
Authors:
* Tianqi Chen, project creater
* Kailong Chen, contributes regression module
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
Features
=======
* Sparse feature format:
- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
* Push the limit on single machine:
- Efficient implementation that optimizes memory and computation.
* Layout of gradient boosting algorithm to support generic tasks, see project wiki.
Supported key components
=======
* Gradient boosting models:
- regression tree (GBRT)
- linear model/lasso
* Objectives to support tasks:
- regression
- classification
* OpenMP implementation
Planned components
=======
* More objective to support tasks:
- ranking
- matrix factorization
- structured prediction
File extension convention
=======
* .h are interface, utils and data structures, with detailed comment;
* .cpp are implementations that will be compiled, with less comment;
* .hpp are implementations that will be included by .cpp, with less comment

View File

@@ -0,0 +1,200 @@
#ifndef XGBOOST_LINEAR_HPP
#define XGBOOST_LINEAR_HPP
/*!
* \file xgboost_linear.h
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is coordinate descent, require column major format
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include <algorithm>
#include "../xgboost.h"
#include "../../utils/xgboost_utils.h"
namespace xgboost{
namespace booster{
/*! \brief linear model, with L1/L2 regularization */
template<typename FMatrix>
class LinearBooster : public InterfaceBooster<FMatrix>{
public:
LinearBooster( void ){ silent = 0;}
virtual ~LinearBooster( void ){}
public:
virtual void SetParam( const char *name, const char *val ){
if( !strcmp( name, "silent") ) silent = atoi( val );
if( model.weight.size() == 0 ) model.param.SetParam( name, val );
param.SetParam( name, val );
}
virtual void LoadModel( utils::IStream &fi ){
model.LoadModel( fi );
}
virtual void SaveModel( utils::IStream &fo ) const{
model.SaveModel( fo );
}
virtual void InitModel( void ){
model.InitModel();
}
public:
virtual void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &fmat,
const std::vector<unsigned> &root_index ){
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
this->UpdateWeights( grad, hess, fmat );
}
inline float Predict( const FMatrix &fmat, bst_uint ridx, unsigned root_index ){
float sum = model.bias();
for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
sum += model.weight[ it.findex() ] * it.fvalue();
}
return sum;
}
virtual float Predict( const std::vector<float> &feat,
const std::vector<bool> &funknown,
unsigned rid = 0 ){
float sum = model.bias();
for( size_t i = 0; i < feat.size(); i ++ ){
if( funknown[i] ) continue;
sum += model.weight[ i ] * feat[ i ];
}
return sum;
}
protected:
// training parameter
struct ParamTrain{
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
/*! \brief regularization weight for L2 norm in bias */
float reg_lambda_bias;
ParamTrain( void ){
reg_alpha = 0.0f; reg_lambda = 0.0f; reg_lambda_bias = 0.0f;
learning_rate = 1.0f;
}
inline void SetParam( const char *name, const char *val ){
// sync-names
if( !strcmp( "eta", name ) ) learning_rate = (float)atof( val );
if( !strcmp( "lambda", name ) ) reg_lambda = (float)atof( val );
if( !strcmp( "alpha", name ) ) reg_alpha = (float)atof( val );
if( !strcmp( "lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
// real names
if( !strcmp( "learning_rate", name ) ) learning_rate = (float)atof( val );
if( !strcmp( "reg_lambda", name ) ) reg_lambda = (float)atof( val );
if( !strcmp( "reg_alpha", name ) ) reg_alpha = (float)atof( val );
if( !strcmp( "reg_lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
}
// given original weight calculate delta
inline double CalcDelta( double sum_grad, double sum_hess, double w ){
if( sum_hess < 1e-5f ) return 0.0f;
double tmp = w - ( sum_grad + reg_lambda*w )/( sum_hess + reg_lambda );
if ( tmp >=0 ){
return std::max(-( sum_grad + reg_lambda*w + reg_alpha)/(sum_hess+reg_lambda),-w);
}else{
return std::min(-( sum_grad + reg_lambda*w - reg_alpha)/(sum_hess+reg_lambda),-w);
}
}
// given original weight calculate delta bias
inline double CalcDeltaBias( double sum_grad, double sum_hess, double w ){
return - (sum_grad + reg_lambda_bias*w) / (sum_hess + reg_lambda_bias );
}
};
// model for linear booster
class Model{
public:
// model parameter
struct Param{
// number of feature dimension
int num_feature;
// reserved field
int reserved[ 32 ];
// constructor
Param( void ){
num_feature = 0;
memset( reserved, 0, sizeof(reserved) );
}
inline void SetParam( const char *name, const char *val ){
if( !strcmp( name, "num_feature" ) ) num_feature = atoi( val );
}
};
public:
Param param;
// weight for each of feature, bias is the last one
std::vector<float> weight;
public:
// initialize the model parameter
inline void InitModel( void ){
// bias is the last weight
weight.resize( param.num_feature + 1 );
std::fill( weight.begin(), weight.end(), 0.0f );
}
// save the model to file
inline void SaveModel( utils::IStream &fo ) const{
fo.Write( &param, sizeof(Param) );
fo.Write( &weight[0], sizeof(float) * weight.size() );
}
// load model from file
inline void LoadModel( utils::IStream &fi ){
utils::Assert( fi.Read( &param, sizeof(Param) ) != 0, "Load LinearBooster" );
weight.resize( param.num_feature + 1 );
utils::Assert( fi.Read( &weight[0], sizeof(float) * weight.size() ) != 0, "Load LinearBooster" );
}
// model bias
inline float &bias( void ){
return weight.back();
}
};
private:
int silent;
protected:
Model model;
ParamTrain param;
protected:
// update weights, should work for any FMatrix
inline void UpdateWeights( std::vector<float> &grad,
const std::vector<float> &hess,
const FMatrix &smat ){
{// optimize bias
double sum_grad = 0.0, sum_hess = 0.0;
for( size_t i = 0; i < grad.size(); i ++ ){
sum_grad += grad[ i ]; sum_hess += hess[ i ];
}
// remove bias effect
double dw = param.learning_rate * param.CalcDeltaBias( sum_grad, sum_hess, model.bias() );
model.bias() += dw;
// update grad value
for( size_t i = 0; i < grad.size(); i ++ ){
grad[ i ] += dw * hess[ i ];
}
}
// optimize weight
const unsigned nfeat= (unsigned)smat.NumCol();
for( unsigned i = 0; i < nfeat; i ++ ){
if( !smat.GetSortedCol( i ).Next() ) continue;
double sum_grad = 0.0, sum_hess = 0.0;
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
const float v = it.fvalue();
sum_grad += grad[ it.rindex() ] * v;
sum_hess += hess[ it.rindex() ] * v * v;
}
float w = model.weight[ i ];
double dw = param.learning_rate * param.CalcDelta( sum_grad, sum_hess, w );
model.weight[ i ] += dw;
// update grad value
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
const float v = it.fvalue();
grad[ it.rindex() ] += hess[ it.rindex() ] * v * dw;
}
}
}
};
};
};
#endif

View File

@@ -0,0 +1,147 @@
#ifndef XGBOOST_BASE_TREEMAKER_HPP
#define XGBOOST_BASE_TREEMAKER_HPP
/*!
* \file xgboost_base_treemaker.hpp
* \brief implementation of base data structure for regression tree maker,
* gives common operations of tree construction steps template
*
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include "xgboost_tree_model.h"
namespace xgboost{
namespace booster{
class BaseTreeMaker{
protected:
BaseTreeMaker( RegTree &tree,
const TreeParamTrain &param )
: tree( tree ), param( param ){}
protected:
// statistics that is helpful to decide a split
struct SplitEntry{
/*! \brief loss change after split this node */
float loss_chg;
/*! \brief split index */
unsigned sindex;
/*! \brief split value */
float split_value;
/*! \brief constructor */
SplitEntry( void ){
loss_chg = 0.0f;
split_value = 0.0f; sindex = 0;
}
// This function gives better priority to lower index when loss_chg equals
// not the best way, but helps to give consistent result during multi-thread execution
inline bool NeedReplace( float loss_chg, unsigned split_index ) const{
if( this->split_index() <= split_index ){
return loss_chg > this->loss_chg;
}else{
return !(this->loss_chg > loss_chg);
}
}
inline bool Update( const SplitEntry &e ){
if( this->NeedReplace( e.loss_chg, e.split_index() ) ){
this->loss_chg = e.loss_chg;
this->sindex = e.sindex;
this->split_value = e.split_value;
return true;
} else{
return false;
}
}
inline bool Update( float loss_chg, unsigned split_index, float split_value, bool default_left ){
if( this->NeedReplace( loss_chg, split_index ) ){
this->loss_chg = loss_chg;
if( default_left ) split_index |= (1U << 31);
this->sindex = split_index;
this->split_value = split_value;
return true;
}else{
return false;
}
}
inline unsigned split_index( void ) const{
return sindex & ( (1U<<31) - 1U );
}
inline bool default_left( void ) const{
return (sindex >> 31) != 0;
}
};
struct NodeEntry{
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief loss of this node, without split */
float root_gain;
/*! \brief weight calculated related to current data */
float weight;
/*! \brief current best solution */
SplitEntry best;
NodeEntry( void ){
sum_grad = sum_hess = 0.0;
weight = root_gain = 0.0f;
}
};
private:
// try to prune off current leaf, return true if successful
inline void TryPruneLeaf( int nid, int depth ){
if( tree[ nid ].is_root() ) return;
int pid = tree[ nid ].parent();
RegTree::NodeStat &s = tree.stat( pid );
++ s.leaf_child_cnt;
if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
this->stat_num_pruned += 2;
// need to be pruned
tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight );
// tail recursion
this->TryPruneLeaf( pid, depth - 1 );
}
}
protected:
/*! \brief do prunning of a tree */
inline int DoPrune( void ){
this->stat_num_pruned = 0;
// initialize auxiliary statistics
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
tree.stat( nid ).leaf_child_cnt = 0;
tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
}
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
if( tree[ nid ].is_leaf() ) this->TryPruneLeaf( nid, tree.GetDepth(nid) );
}
return this->stat_num_pruned;
}
protected:
/*! \brief update queue expand add in new leaves */
inline void UpdateQueueExpand( std::vector<int> &qexpand ){
std::vector<int> newnodes;
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
if( !tree[ nid ].is_leaf() ){
newnodes.push_back( tree[nid].cleft() );
newnodes.push_back( tree[nid].cright() );
}
}
// use new nodes for qexpand
qexpand = newnodes;
}
protected:
// local helper tmp data structure
// statistics
int stat_num_pruned;
/*! \brief queue of nodes to be expanded */
std::vector<int> qexpand;
/*! \brief TreeNode Data: statistics for each constructed node, the derived class must maintain this */
std::vector<NodeEntry> snode;
protected:
// original data that supports tree construction
RegTree &tree;
const TreeParamTrain &param;
};
}; // namespace booster
}; // namespace xgboost
#endif // XGBOOST_BASE_TREEMAKER_HPP

View File

@@ -0,0 +1,335 @@
#ifndef XGBOOST_COL_TREEMAKER_HPP
#define XGBOOST_COL_TREEMAKER_HPP
/*!
* \file xgboost_col_treemaker.hpp
* \brief implementation of regression tree maker,
* use a column based approach, with OpenMP
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
// use openmp
#include <vector>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_omp.h"
#include "../../utils/xgboost_random.h"
#include "../../utils/xgboost_fmap.h"
#include "xgboost_base_treemaker.hpp"
namespace xgboost{
namespace booster{
template<typename FMatrix>
class ColTreeMaker : protected BaseTreeMaker{
public:
ColTreeMaker( RegTree &tree,
const TreeParamTrain &param,
const std::vector<float> &grad,
const std::vector<float> &hess,
const FMatrix &smat,
const std::vector<unsigned> &root_index,
const utils::FeatConstrain &constrain )
: BaseTreeMaker( tree, param ),
grad(grad), hess(hess),
smat(smat), root_index(root_index), constrain(constrain) {
utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
}
inline void Make( int& stat_max_depth, int& stat_num_pruned ){
this->InitData();
this->InitNewNode( this->qexpand );
stat_max_depth = 0;
for( int depth = 0; depth < param.max_depth; ++ depth ){
this->FindSplit( depth );
this->UpdateQueueExpand( this->qexpand );
this->InitNewNode( this->qexpand );
// if nothing left to be expand, break
if( qexpand.size() == 0 ) break;
stat_max_depth = depth + 1;
}
// set all the rest expanding nodes to leaf
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
}
// start prunning the tree
stat_num_pruned = this->DoPrune();
}
private:
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry{
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief last feature value scanned */
float last_fvalue;
/*! \brief current best solution */
SplitEntry best;
/*! \brief constructor */
ThreadEntry( void ){
this->ClearStats();
}
/*! \brief clear statistics */
inline void ClearStats( void ){
sum_grad = sum_hess = 0.0;
}
};
private:
// make leaf nodes for all qexpand, update node statistics, mark leaf value
inline void InitNewNode( const std::vector<int> &qexpand ){
{// setup statistics space for each tree node
for( size_t i = 0; i < stemp.size(); ++ i ){
stemp[i].resize( tree.param.num_nodes, ThreadEntry() );
}
snode.resize( tree.param.num_nodes, NodeEntry() );
}
const unsigned ndata = static_cast<unsigned>( position.size() );
#pragma omp parallel for schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const int tid = omp_get_thread_num();
if( position[i] < 0 ) continue;
stemp[tid][ position[i] ].sum_grad += grad[i];
stemp[tid][ position[i] ].sum_hess += hess[i];
}
for( size_t j = 0; j < qexpand.size(); ++ j ){
const int nid = qexpand[ j ];
double sum_grad = 0.0, sum_hess = 0.0;
for( size_t tid = 0; tid < stemp.size(); tid ++ ){
sum_grad += stemp[tid][nid].sum_grad;
sum_hess += stemp[tid][nid].sum_hess;
}
// update node statistics
snode[nid].sum_grad = sum_grad;
snode[nid].sum_hess = sum_hess;
snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
if( !tree[nid].is_root() ){
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, tree.stat( tree[nid].parent() ).base_weight );
tree.stat(nid).base_weight = snode[nid].weight;
}else{
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
tree.stat(nid).base_weight = snode[nid].weight;
}
}
}
private:
// enumerate the split values of specific feature
template<typename Iter>
inline void EnumerateSplit( Iter it, const unsigned fid, std::vector<ThreadEntry> &temp, bool is_forward_search ){
// clear all the temp statistics
for( size_t j = 0; j < qexpand.size(); ++ j ){
temp[ qexpand[j] ].ClearStats();
}
while( it.Next() ){
const bst_uint ridx = it.rindex();
const int nid = position[ ridx ];
if( nid < 0 ) continue;
const float fvalue = it.fvalue();
ThreadEntry &e = temp[ nid ];
// test if first hit, this is fine, because we set 0 during init
if( e.sum_hess == 0.0 ){
e.sum_grad = grad[ ridx ];
e.sum_hess = hess[ ridx ];
e.last_fvalue = fvalue;
}else{
// try to find a split
if( fabsf(fvalue - e.last_fvalue) > rt_2eps && e.sum_hess >= param.min_child_weight ){
const double csum_hess = snode[ nid ].sum_hess - e.sum_hess;
if( csum_hess >= param.min_child_weight ){
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
const double loss_chg =
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+ param.CalcGain( csum_grad , csum_hess , snode[nid].weight )
- snode[nid].root_gain;
e.best.Update( loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search );
}
}
// update the statistics
e.sum_grad += grad[ ridx ];
e.sum_hess += hess[ ridx ];
e.last_fvalue = fvalue;
}
}
// finish updating all statistics, check if it is possible to include all sum statistics
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[ i ];
ThreadEntry &e = temp[ nid ];
const double csum_hess = snode[nid].sum_hess - e.sum_hess;
if( e.sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
const double loss_chg =
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+ param.CalcGain( csum_grad, csum_hess, snode[nid].weight )
- snode[nid].root_gain;
const float delta = is_forward_search ? rt_eps:-rt_eps;
e.best.Update( loss_chg, fid, e.last_fvalue + delta, !is_forward_search );
}
}
}
// find splits at current level
inline void FindSplit( int depth ){
const unsigned nsize = static_cast<unsigned>( feat_index.size() );
#pragma omp parallel for schedule( dynamic, 1 )
for( unsigned i = 0; i < nsize; ++ i ){
const unsigned fid = feat_index[i];
const int tid = omp_get_thread_num();
if( param.need_forward_search() ){
this->EnumerateSplit( smat.GetSortedCol(fid), fid, stemp[tid], true );
}
if( param.need_backward_search() ){
this->EnumerateSplit( smat.GetReverseSortedCol(fid), fid, stemp[tid], false );
}
}
// after this each thread's stemp will get the best candidates, aggregate results
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[ i ];
NodeEntry &e = snode[ nid ];
for( int tid = 0; tid < this->nthread; ++ tid ){
e.best.Update( stemp[ tid ][ nid ].best );
}
// now we know the solution in snode[ nid ], set split
if( e.best.loss_chg > rt_eps ){
tree.AddChilds( nid );
tree[ nid ].set_split( e.best.split_index(), e.best.split_value, e.best.default_left() );
} else{
tree[ nid ].set_leaf( e.weight * param.learning_rate );
}
}
{// reset position
// step 1, set default direct nodes to default, and leaf nodes to -1,
const unsigned ndata = static_cast<unsigned>( position.size() );
#pragma omp parallel for schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const int nid = position[i];
if( nid >= 0 ){
if( tree[ nid ].is_leaf() ){
position[i] = -1;
}else{
// push to default branch, correct latter
position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
}
}
}
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
if( !tree[nid].is_leaf() ) fsplits.push_back( tree[nid].split_index() );
}
std::sort( fsplits.begin(), fsplits.end() );
fsplits.resize( std::unique( fsplits.begin(), fsplits.end() ) - fsplits.begin() );
const unsigned nfeats = static_cast<unsigned>( fsplits.size() );
#pragma omp parallel for schedule( dynamic, 1 )
for( unsigned i = 0; i < nfeats; ++ i ){
const unsigned fid = fsplits[i];
for( typename FMatrix::ColIter it = smat.GetSortedCol( fid ); it.Next(); ){
const bst_uint ridx = it.rindex();
int nid = position[ ridx ];
if( nid == -1 ) continue;
// go back to parent, correct those who are not default
nid = tree[ nid ].parent();
if( tree[ nid ].split_index() == fid ){
if( it.fvalue() < tree[nid].split_cond() ){
position[ ridx ] = tree[ nid ].cleft();
}else{
position[ ridx ] = tree[ nid ].cright();
}
}
}
}
}
}
private:
// initialize temp data structure
inline void InitData( void ){
{
position.resize( grad.size() );
if( root_index.size() == 0 ){
std::fill( position.begin(), position.end(), 0 );
}else{
for( size_t i = 0; i < root_index.size(); ++ i ){
position[i] = root_index[i];
utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
}
}
// mark delete for the deleted datas
for( size_t i = 0; i < grad.size(); ++ i ){
if( hess[i] < 0.0f ) position[i] = -1;
}
if( param.subsample < 1.0f - 1e-6f ){
for( size_t i = 0; i < grad.size(); ++ i ){
if( hess[i] < 0.0f ) continue;
if( random::SampleBinary( param.subsample) == 0 ){
position[ i ] = -1;
}
}
}
}
{// initialize feature index
int ncol = static_cast<int>( smat.NumCol() );
for( int i = 0; i < ncol; i ++ ){
if( smat.GetSortedCol(i).Next() && constrain.NotBanned(i) ){
feat_index.push_back( i );
}
}
random::Shuffle( feat_index );
}
{// setup temp space for each thread
if( param.nthread != 0 ){
omp_set_num_threads( param.nthread );
}
#pragma omp parallel
{
this->nthread = omp_get_num_threads();
}
// reserve a small space
stemp.resize( this->nthread, std::vector<ThreadEntry>() );
for( size_t i = 0; i < stemp.size(); ++ i ){
stemp[i].reserve( 256 );
}
snode.reserve( 256 );
}
{// expand query
qexpand.reserve( 256 ); qexpand.clear();
for( int i = 0; i < tree.param.num_roots; ++ i ){
qexpand.push_back( i );
}
}
}
private:
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
std::vector<int> feat_index;
// Instance Data: current node position in the tree of each instance
std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction
std::vector< std::vector<ThreadEntry> > stemp;
private:
const std::vector<float> &grad;
const std::vector<float> &hess;
const FMatrix &smat;
const std::vector<unsigned> &root_index;
const utils::FeatConstrain &constrain;
};
};
};
#endif

View File

@@ -0,0 +1,386 @@
#ifndef XGBOOST_ROW_TREEMAKER_HPP
#define XGBOOST_ROW_TREEMAKER_HPP
/*!
* \file xgboost_row_treemaker.hpp
* \brief implementation of regression tree maker,
* use a row based approach
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
// use openmp
#include <vector>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_omp.h"
#include "../../utils/xgboost_random.h"
#include "../../utils/xgboost_fmap.h"
#include "xgboost_base_treemaker.hpp"
namespace xgboost{
namespace booster{
template<typename FMatrix>
class RowTreeMaker : protected BaseTreeMaker{
public:
RowTreeMaker( RegTree &tree,
const TreeParamTrain &param,
const std::vector<float> &grad,
const std::vector<float> &hess,
const FMatrix &smat,
const std::vector<unsigned> &root_index,
const utils::FeatConstrain &constrain )
: BaseTreeMaker( tree, param ),
grad(grad), hess(hess),
smat(smat), root_index(root_index), constrain(constrain) {
utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
{// setup temp space for each thread
if( param.nthread != 0 ){
omp_set_num_threads( param.nthread );
}
#pragma omp parallel
{
this->nthread = omp_get_num_threads();
}
tmp_rptr.resize( this->nthread, std::vector<size_t>() );
snode.reserve( 256 );
}
}
inline void Make( int& stat_max_depth, int& stat_num_pruned ){
this->InitData();
this->InitNewNode( this->qexpand );
stat_max_depth = 0;
for( int depth = 0; depth < param.max_depth; ++ depth ){
this->FindSplit( this->qexpand, depth );
this->UpdateQueueExpand( this->qexpand );
this->InitNewNode( this->qexpand );
// if nothing left to be expand, break
if( qexpand.size() == 0 ) break;
stat_max_depth = depth + 1;
}
// set all the rest expanding nodes to leaf
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
}
// start prunning the tree
stat_num_pruned = this->DoPrune();
}
// expand a specific node
inline bool Expand( const std::vector<bst_uint> &valid_index, int nid ){
if( valid_index.size() == 0 ) return false;
this->InitDataExpand( valid_index, nid );
this->InitNewNode( this->qexpand );
this->FindSplit( nid, tmp_rptr[0] );
// update node statistics
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
}
// change the leaf
this->UpdateQueueExpand( this->qexpand );
this->InitNewNode( this->qexpand );
// set all the rest expanding nodes to leaf
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
tree.stat( nid ).loss_chg = 0.0f;
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
tree.param.max_depth = std::max( tree.param.max_depth, tree.GetDepth( nid ) );
}
if( qexpand.size() != 0 ) {
return true;
}else{
return false;
}
}
// collapse specific node
inline void Collapse( const std::vector<bst_uint> &valid_index, int nid ){
if( valid_index.size() == 0 ) return;
this->InitDataExpand( valid_index, nid );
this->InitNewNode( this->qexpand );
tree.stat( nid ).loss_chg = 0.0f;
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
tree.CollapseToLeaf( nid, snode[nid].weight * param.learning_rate );
}
private:
// make leaf nodes for all qexpand, update node statistics, mark leaf value
inline void InitNewNode( const std::vector<int> &qexpand ){
snode.resize( tree.param.num_nodes, NodeEntry() );
for( size_t j = 0; j < qexpand.size(); ++j ){
const int nid = qexpand[ j ];
double sum_grad = 0.0, sum_hess = 0.0;
for( bst_uint i = node_bound[nid].first; i < node_bound[nid].second; ++i ){
const bst_uint ridx = row_index_set[i];
sum_grad += grad[ridx]; sum_hess += hess[ridx];
}
// update node statistics
snode[nid].sum_grad = sum_grad;
snode[nid].sum_hess = sum_hess;
snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
if( !tree[nid].is_root() ){
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, tree.stat( tree[nid].parent() ).base_weight );
tree.stat(nid).base_weight = snode[nid].weight;
}else{
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
tree.stat(nid).base_weight = snode[nid].weight;
}
}
}
private:
// enumerate the split values of specific feature
template<typename Iter>
inline void EnumerateSplit( Iter it, SplitEntry &best, const int nid, const unsigned fid, bool is_forward_search ){
float last_fvalue = 0.0f;
double sum_hess = 0.0, sum_grad = 0.0;
const NodeEntry enode = snode[ nid ];
while( it.Next() ){
const bst_uint ridx = it.rindex();
const float fvalue = it.fvalue();
if( sum_hess == 0.0 ){
sum_grad = grad[ ridx ];
sum_hess = hess[ ridx ];
last_fvalue = fvalue;
}else{
// try to find a split
if( fabsf(fvalue - last_fvalue) > rt_2eps && sum_hess >= param.min_child_weight ){
const double csum_hess = enode.sum_hess - sum_hess;
if( csum_hess >= param.min_child_weight ){
const double csum_grad = enode.sum_grad - sum_grad;
const double loss_chg =
+ param.CalcGain( sum_grad, sum_hess, enode.weight )
+ param.CalcGain( csum_grad, csum_hess, enode.weight )
- enode.root_gain;
best.Update( loss_chg, fid, (fvalue + last_fvalue) * 0.5f, !is_forward_search );
}else{
// the rest part doesn't meet split condition anyway, return
return;
}
}
// update the statistics
sum_grad += grad[ ridx ];
sum_hess += hess[ ridx ];
last_fvalue = fvalue;
}
}
const double csum_hess = enode.sum_hess - sum_hess;
if( sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
const double csum_grad = enode.sum_grad - sum_grad;
const double loss_chg =
+ param.CalcGain( sum_grad, sum_hess, enode.weight )
+ param.CalcGain( csum_grad, csum_hess, enode.weight )
- snode[nid].root_gain;
const float delta = is_forward_search ? rt_eps:-rt_eps;
best.Update( loss_chg, fid, last_fvalue + delta, !is_forward_search );
}
}
private:
inline void FindSplit( const std::vector<int> &qexpand, int depth ){
int nexpand = (int)qexpand.size();
if( depth < 3 ){
for( int i = 0; i < nexpand; ++ i ){
this->FindSplit( qexpand[i], tmp_rptr[0] );
}
}else{
// if get to enough depth, parallelize over node
#pragma omp parallel for schedule(dynamic,1)
for( int i = 0; i < nexpand; ++ i ){
const int tid = omp_get_thread_num();
utils::Assert( tid < (int)tmp_rptr.size(), "BUG: FindSplit, tid exceed tmp_rptr size" );
this->FindSplit( qexpand[i], tmp_rptr[tid] );
}
}
}
private:
inline void MakeSplit( int nid, unsigned gid ){
node_bound.resize( tree.param.num_nodes );
// re-organize the row_index_set after split on nid
const unsigned split_index = tree[nid].split_index();
const float split_value = tree[nid].split_cond();
std::vector<bst_uint> right;
bst_uint top = node_bound[nid].first;
for( bst_uint i = node_bound[ nid ].first; i < node_bound[ nid ].second; ++i ){
const bst_uint ridx = row_index_set[i];
bool goleft = tree[ nid ].default_left();
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
if( it.findex() == split_index ){
if( it.fvalue() < split_value ){
goleft = true; break;
}else{
goleft = false; break;
}
}
}
if( goleft ) {
row_index_set[ top ++ ] = ridx;
}else{
right.push_back( ridx );
}
}
node_bound[ tree[nid].cleft() ] = std::make_pair( node_bound[nid].first, top );
node_bound[ tree[nid].cright() ] = std::make_pair( top, node_bound[nid].second );
utils::Assert( node_bound[nid].second - top == (bst_uint)right.size(), "BUG:MakeSplit" );
for( size_t i = 0; i < right.size(); ++ i ){
row_index_set[ top ++ ] = right[ i ];
}
}
// find splits at current level
inline void FindSplit( int nid, std::vector<size_t> &tmp_rptr ){
if( tmp_rptr.size() == 0 ){
tmp_rptr.resize( tree.param.num_feature + 1, 0 );
}
const bst_uint begin = node_bound[ nid ].first;
const bst_uint end = node_bound[ nid ].second;
const unsigned ncgroup = smat.NumColGroup();
unsigned best_group = 0;
for( unsigned gid = 0; gid < ncgroup; ++gid ){
// records the columns
std::vector<FMatrixS::REntry> centry;
// records the active features
std::vector<size_t> aclist;
utils::SparseCSRMBuilder<FMatrixS::REntry,true> builder( tmp_rptr, centry, aclist );
builder.InitBudget( tree.param.num_feature );
for( bst_uint i = begin; i < end; ++i ){
const bst_uint ridx = row_index_set[i];
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
const bst_uint findex = it.findex();
if( constrain.NotBanned( findex ) ) builder.AddBudget( findex );
}
}
builder.InitStorage();
for( bst_uint i = begin; i < end; ++i ){
const bst_uint ridx = row_index_set[i];
for( typename FMatrix::RowIter it = smat.GetRow(ridx,gid); it.Next(); ){
const bst_uint findex = it.findex();
if( constrain.NotBanned( findex ) ) {
builder.PushElem( findex, FMatrixS::REntry( ridx, it.fvalue() ) );
}
}
}
// --- end of building column major matrix ---
// after this point, tmp_rptr and entry is ready to use
int naclist = (int)aclist.size();
// best entry for each thread
SplitEntry nbest, tbest;
#pragma omp parallel private(tbest)
{
#pragma omp for schedule(dynamic,1)
for( int j = 0; j < naclist; ++j ){
bst_uint findex = static_cast<bst_uint>( aclist[j] );
// local sort can be faster when the features are sparse
std::sort( centry.begin() + tmp_rptr[findex], centry.begin() + tmp_rptr[findex+1], FMatrixS::REntry::cmp_fvalue );
if( param.need_forward_search() ){
this->EnumerateSplit( FMatrixS::ColIter( &centry[tmp_rptr[findex]]-1, &centry[tmp_rptr[findex+1]] - 1 ),
tbest, nid, findex, true );
}
if( param.need_backward_search() ){
this->EnumerateSplit( FMatrixS::ColBackIter( &centry[tmp_rptr[findex+1]], &centry[tmp_rptr[findex]] ),
tbest, nid, findex, false );
}
}
#pragma omp critical
{
nbest.Update( tbest );
}
}
// if current solution gives the best
if( snode[nid].best.Update( nbest ) ){
best_group = gid;
}
// cleanup tmp_rptr for next usage
builder.Cleanup();
}
// at this point, we already know the best split
if( snode[nid].best.loss_chg > rt_eps ){
const SplitEntry &e = snode[nid].best;
tree.AddChilds( nid );
tree[ nid ].set_split( e.split_index(), e.split_value, e.default_left() );
this->MakeSplit( nid, best_group );
}else{
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
}
}
private:
// initialize temp data structure
inline void InitData( void ){
std::vector<bst_uint> valid_index;
for( size_t i = 0; i < grad.size(); ++i ){
if( hess[ i ] < 0.0f ) continue;
if( param.subsample > 1.0f-1e-6f || random::SampleBinary( param.subsample ) != 0 ){
valid_index.push_back( static_cast<bst_uint>(i) );
}
}
node_bound.resize( tree.param.num_roots );
if( root_index.size() == 0 ){
row_index_set = valid_index;
// set bound of root node
node_bound[0] = std::make_pair( 0, (bst_uint)row_index_set.size() );
}else{
std::vector<size_t> rptr;
utils::SparseCSRMBuilder<bst_uint> builder( rptr, row_index_set );
builder.InitBudget( tree.param.num_roots );
for( size_t i = 0; i < valid_index.size(); ++i ){
const bst_uint rid = valid_index[ i ];
utils::Assert( root_index[ rid ] < (unsigned)tree.param.num_roots, "root id exceed number of roots" );
builder.AddBudget( root_index[ rid ] );
}
builder.InitStorage();
for( size_t i = 0; i < valid_index.size(); ++i ){
const bst_uint rid = valid_index[ i ];
builder.PushElem( root_index[ rid ], rid );
}
for( size_t i = 1; i < rptr.size(); ++ i ){
node_bound[i-1] = std::make_pair( rptr[ i - 1 ], rptr[ i ] );
}
}
{// expand query
qexpand.reserve( 256 ); qexpand.clear();
for( int i = 0; i < tree.param.num_roots; ++ i ){
qexpand.push_back( i );
}
}
}
// initialize temp data structure
inline void InitDataExpand( const std::vector<bst_uint> &valid_index, int nid ){
row_index_set = valid_index;
node_bound.resize( tree.param.num_nodes );
node_bound[ nid ] = std::make_pair( 0, (bst_uint)row_index_set.size() );
qexpand.clear(); qexpand.push_back( nid );
}
private:
// number of omp thread used during training
int nthread;
// tmp row pointer, per thread, used for tmp data construction
std::vector< std::vector<size_t> > tmp_rptr;
// Instance row indexes corresponding to each node
std::vector<bst_uint> row_index_set;
// lower and upper bound of each nodes' row_index
std::vector< std::pair<bst_uint, bst_uint> > node_bound;
private:
const std::vector<float> &grad;
const std::vector<float> &hess;
const FMatrix &smat;
const std::vector<unsigned> &root_index;
const utils::FeatConstrain &constrain;
};
};
};
#endif

View File

@@ -0,0 +1,430 @@
#ifndef XGBOOST_APEX_TREE_HPP
#define XGBOOST_APEX_TREE_HPP
/*!
* \file xgboost_svdf_tree.hpp
* \brief implementation of regression tree constructor, with layerwise support
* this file is adapted from GBRT implementation in SVDFeature project
* \author Tianqi Chen: tqchen@apex.sjtu.edu.cn, tianqi.tchen@gmail.com
*/
#include <algorithm>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_random.h"
#include "../../utils/xgboost_matrix_csr.h"
namespace xgboost{
namespace booster{
inline void assert_sorted( unsigned *idset, int len ){
if( !rt_debug || !check_bug ) return;
for( int i = 1; i < len; i ++ ){
utils::Assert( idset[i-1] < idset[i], "idset not sorted" );
}
}
};
namespace booster{
// selecter of rtree to find the suitable candidate
class RTSelecter{
public:
struct Entry{
float loss_chg;
size_t start;
int len;
unsigned sindex;
float split_value;
Entry(){}
Entry( float loss_chg, size_t start, int len, unsigned split_index, float split_value, bool default_left ){
this->loss_chg = loss_chg;
this->start = start;
this->len = len;
if( default_left ) split_index |= (1U << 31);
this->sindex = split_index;
this->split_value = split_value;
}
inline unsigned split_index( void ) const{
return sindex & ( (1U<<31) - 1U );
}
inline bool default_left( void ) const{
return (sindex >> 31) != 0;
}
};
private:
Entry best_entry;
const TreeParamTrain &param;
public:
RTSelecter( const TreeParamTrain &p ):param( p ){
memset( &best_entry, 0, sizeof(best_entry) );
best_entry.loss_chg = 0.0f;
}
inline void push_back( const Entry &e ){
if( e.loss_chg > best_entry.loss_chg ) best_entry = e;
}
inline const Entry & select( void ){
return best_entry;
}
};
// updater of rtree, allows the parameters to be stored inside, key solver
template<typename FMatrix>
class RTreeUpdater{
protected:
// training task, element of single task
struct Task{
// node id in tree
int nid;
// idset pointer, instance id in [idset,idset+len)
unsigned *idset;
// length of idset
unsigned len;
// base_weight of parent
float parent_base_weight;
Task(){}
Task( int nid, unsigned *idset, unsigned len, float pweight = 0.0f ){
this->nid = nid;
this->idset = idset;
this->len = len;
this->parent_base_weight = pweight;
}
};
// sparse column entry
struct SCEntry{
// feature value
float fvalue;
// row index in grad
unsigned rindex;
SCEntry(){}
SCEntry( float fvalue, unsigned rindex ){
this->fvalue = fvalue; this->rindex = rindex;
}
inline bool operator<( const SCEntry &p ) const{
return fvalue < p.fvalue;
}
};
private:
// training parameter
const TreeParamTrain &param;
// parameters, reference
RegTree &tree;
std::vector<float> &grad;
std::vector<float> &hess;
const FMatrix &smat;
const std::vector<unsigned> &group_id;
private:
// maximum depth up to now
int max_depth;
// number of nodes being pruned
int num_pruned;
// stack to store current task
std::vector<Task> task_stack;
// temporal space for index set
std::vector<unsigned> idset;
private:
// task management: NOTE DFS here
inline void add_task( Task tsk ){
task_stack.push_back( tsk );
}
inline bool next_task( Task &tsk ){
if( task_stack.size() == 0 ) return false;
tsk = task_stack.back();
task_stack.pop_back();
return true;
}
private:
// try to prune off current leaf, return true if successful
inline void try_prune_leaf( int nid, int depth ){
if( tree[ nid ].is_root() ) return;
int pid = tree[ nid ].parent();
RegTree::NodeStat &s = tree.stat( pid );
s.leaf_child_cnt ++;
if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
// need to be pruned
tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight );
// add statistics to number of nodes pruned
num_pruned += 2;
// tail recursion
this->try_prune_leaf( pid, depth - 1 );
}
}
// make leaf for current node :)
inline void make_leaf( Task tsk, double sum_grad, double sum_hess, bool compute ){
for( unsigned i = 0; i < tsk.len; i ++ ){
const unsigned ridx = tsk.idset[i];
if( compute ){
sum_grad += grad[ ridx ];
sum_hess += hess[ ridx ];
}
}
tree.stat( tsk.nid ).sum_hess = static_cast<float>( sum_hess );
tree[ tsk.nid ].set_leaf( param.learning_rate * param.CalcWeight( sum_grad, sum_hess, tsk.parent_base_weight ) );
this->try_prune_leaf( tsk.nid, tree.GetDepth( tsk.nid ) );
}
private:
// make split for current task, re-arrange positions in idset
inline void make_split( Task tsk, const SCEntry *entry, int num, float loss_chg, double sum_hess, double base_weight ){
// before split, first prepare statistics
RegTree::NodeStat &s = tree.stat( tsk.nid );
s.loss_chg = loss_chg;
s.leaf_child_cnt = 0;
s.sum_hess = static_cast<float>( sum_hess );
s.base_weight = static_cast<float>( base_weight );
// add childs to current node
tree.AddChilds( tsk.nid );
// assert that idset is sorted
assert_sorted( tsk.idset, tsk.len );
// use merge sort style to get the solution
std::vector<unsigned> qset;
for( int i = 0; i < num; i ++ ){
qset.push_back( entry[i].rindex );
}
std::sort( qset.begin(), qset.end() );
// do merge sort style, make the other set, remove elements in qset
for( unsigned i = 0, top = 0; i < tsk.len; i ++ ){
if( top < qset.size() ){
if( tsk.idset[ i ] != qset[ top ] ){
tsk.idset[ i - top ] = tsk.idset[ i ];
}else{
top ++;
}
}else{
tsk.idset[ i - qset.size() ] = tsk.idset[ i ];
}
}
// get two parts
RegTree::Node &n = tree[ tsk.nid ];
Task def_part( n.default_left() ? n.cleft() : n.cright(), tsk.idset, tsk.len - qset.size(), s.base_weight );
Task spl_part( n.default_left() ? n.cright(): n.cleft() , tsk.idset + def_part.len, qset.size(), s.base_weight );
// fill back split part
for( unsigned i = 0; i < spl_part.len; i ++ ){
spl_part.idset[ i ] = qset[ i ];
}
// add tasks to the queue
this->add_task( def_part );
this->add_task( spl_part );
}
// enumerate split point of the tree
inline void enumerate_split( RTSelecter &sglobal, int tlen,
double rsum_grad, double rsum_hess, double root_gain,
const SCEntry *entry, size_t start, size_t end,
int findex, float parent_base_weight ){
// local selecter
RTSelecter slocal( param );
if( param.need_forward_search() ){
// forward process, default right
double csum_grad = 0.0, csum_hess = 0.0;
for( size_t j = start; j < end; j ++ ){
const unsigned ridx = entry[ j ].rindex;
csum_grad += grad[ ridx ];
csum_hess += hess[ ridx ];
// check for split
if( j == end - 1 || entry[j].fvalue + rt_2eps < entry[ j + 1 ].fvalue ){
if( csum_hess < param.min_child_weight ) continue;
const double dsum_hess = rsum_hess - csum_hess;
if( dsum_hess < param.min_child_weight ) break;
// change of loss
double loss_chg =
param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
const int clen = static_cast<int>( j + 1 - start );
// add candidate to selecter
slocal.push_back( RTSelecter::Entry( loss_chg, start, clen, findex,
j == end - 1 ? entry[j].fvalue + rt_eps : 0.5 * (entry[j].fvalue+entry[j+1].fvalue),
false ) );
}
}
}
if( param.need_backward_search() ){
// backward process, default left
double csum_grad = 0.0, csum_hess = 0.0;
for( size_t j = end; j > start; j -- ){
const unsigned ridx = entry[ j - 1 ].rindex;
csum_grad += grad[ ridx ];
csum_hess += hess[ ridx ];
// check for split
if( j == start + 1 || entry[ j - 2 ].fvalue + rt_2eps < entry[ j - 1 ].fvalue ){
if( csum_hess < param.min_child_weight ) continue;
const double dsum_hess = rsum_hess - csum_hess;
if( dsum_hess < param.min_child_weight ) break;
double loss_chg = param.CalcGain( csum_grad, csum_hess, parent_base_weight ) +
param.CalcGain( rsum_grad - csum_grad, dsum_hess, parent_base_weight ) - root_gain;
const int clen = static_cast<int>( end - j + 1 );
// add candidate to selecter
slocal.push_back( RTSelecter::Entry( loss_chg, j - 1, clen, findex,
j == start + 1 ? entry[j-1].fvalue - rt_eps : 0.5 * (entry[j-2].fvalue + entry[j-1].fvalue),
true ) );
}
}
}
sglobal.push_back( slocal.select() );
}
private:
// temporal storage for expand column major
std::vector<size_t> tmp_rptr;
// find split for current task, another implementation of expand in column major manner
// should be more memory frugal, avoid global sorting across feature
inline void expand( Task tsk ){
// assert that idset is sorted
// if reach maximum depth, make leaf from current node
int depth = tree.GetDepth( tsk.nid );
// update statistiss
if( depth > max_depth ) max_depth = depth;
// if bigger than max depth
if( depth >= param.max_depth ){
this->make_leaf( tsk, 0.0, 0.0, true ); return;
}
// convert to column major CSR format
const int nrows = tree.param.num_feature;
if( tmp_rptr.size() == 0 ){
// initialize tmp storage in first usage
tmp_rptr.resize( nrows + 1 );
std::fill( tmp_rptr.begin(), tmp_rptr.end(), 0 );
}
// records the columns
std::vector<SCEntry> entry;
// records the active features
std::vector<size_t> aclist;
utils::SparseCSRMBuilder<SCEntry,true> builder( tmp_rptr, entry, aclist );
builder.InitBudget( nrows );
// statistics of root
double rsum_grad = 0.0, rsum_hess = 0.0;
for( unsigned i = 0; i < tsk.len; i ++ ){
const unsigned ridx = tsk.idset[i];
rsum_grad += grad[ ridx ];
rsum_hess += hess[ ridx ];
for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
builder.AddBudget( it.findex() );
}
}
// if minimum split weight is not meet
if( param.cannot_split( rsum_hess, depth ) ){
this->make_leaf( tsk, rsum_grad, rsum_hess, false ); builder.Cleanup(); return;
}
builder.InitStorage();
for( unsigned i = 0; i < tsk.len; i ++ ){
const unsigned ridx = tsk.idset[i];
for( typename FMatrix::RowIter it = smat.GetRow(ridx); it.Next(); ){
builder.PushElem( it.findex(), SCEntry( it.fvalue(), ridx ) );
}
}
// --- end of building column major matrix ---
// after this point, tmp_rptr and entry is ready to use
// global selecter
RTSelecter sglobal( param );
// gain root
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
// KEY: layerwise, weight of current node if it is leaf
const double base_weight = param.CalcWeight( rsum_grad, rsum_hess, tsk.parent_base_weight );
// enumerate feature index
for( size_t i = 0; i < aclist.size(); i ++ ){
int findex = static_cast<int>( aclist[i] );
size_t start = tmp_rptr[ findex ];
size_t end = tmp_rptr[ findex + 1 ];
utils::Assert( start < end, "bug" );
// local sort can be faster when the features are sparse
std::sort( entry.begin() + start, entry.begin() + end );
// local selecter
this->enumerate_split( sglobal, tsk.len,
rsum_grad, rsum_hess, root_gain,
&entry[0], start, end, findex, base_weight );
}
// Cleanup tmp_rptr for next use
builder.Cleanup();
// get the best solution
const RTSelecter::Entry &e = sglobal.select();
// allowed to split
if( e.loss_chg > rt_eps ){
// add splits
tree[ tsk.nid ].set_split( e.split_index(), e.split_value, e.default_left() );
// re-arrange idset, push tasks
this->make_split( tsk, &entry[ e.start ], e.len, e.loss_chg, rsum_hess, base_weight );
}else{
// make leaf if we didn't meet requirement
this->make_leaf( tsk, rsum_grad, rsum_hess, false );
}
}
private:
// initialize the tasks
inline void init_tasks( size_t ngrads ){
// add group partition if necessary
if( group_id.size() == 0 ){
if( param.subsample > 1.0f - 1e-6f ){
idset.resize( 0 );
for( size_t i = 0; i < ngrads; i ++ ){
if( hess[i] < 0.0f ) continue;
idset.push_back( (unsigned)i );
}
}else{
idset.resize( 0 );
for( size_t i = 0; i < ngrads; i ++ ){
if( random::SampleBinary( param.subsample ) != 0 ){
idset.push_back( (unsigned)i );
}
}
}
this->add_task( Task( 0, &idset[0], idset.size() ) ); return;
}
utils::Assert( group_id.size() == ngrads, "number of groups must be exact" );
{// new method for grouping, use CSR builder
std::vector<size_t> rptr;
utils::SparseCSRMBuilder<unsigned> builder( rptr, idset );
builder.InitBudget( tree.param.num_roots );
for( size_t i = 0; i < group_id.size(); i ++ ){
// drop invalid elements
if( hess[ i ] < 0.0f ) continue;
utils::Assert( group_id[ i ] < (unsigned)tree.param.num_roots,
"group id exceed number of roots" );
builder.AddBudget( group_id[ i ] );
}
builder.InitStorage();
for( size_t i = 0; i < group_id.size(); i ++ ){
// drop invalid elements
if( hess[ i ] < 0.0f ) continue;
builder.PushElem( group_id[ i ], static_cast<unsigned>(i) );
}
for( size_t i = 1; i < rptr.size(); i ++ ){
const size_t start = rptr[ i - 1 ], end = rptr[ i ];
if( start < end ){
this->add_task( Task( i - 1, &idset[ start ], end - start ) );
}
}
}
}
public:
RTreeUpdater( const TreeParamTrain &pparam,
RegTree &ptree,
std::vector<float> &pgrad,
std::vector<float> &phess,
const FMatrix &psmat,
const std::vector<unsigned> &pgroup_id ):
param( pparam ), tree( ptree ), grad( pgrad ), hess( phess ),
smat( psmat ), group_id( pgroup_id ){
}
inline int do_boost( int &num_pruned ){
this->init_tasks( grad.size() );
this->max_depth = 0;
this->num_pruned = 0;
Task tsk;
while( this->next_task( tsk ) ){
this->expand( tsk );
}
num_pruned = this->num_pruned;
return max_depth;
}
};
};
};
#endif

View File

@@ -0,0 +1,268 @@
#ifndef XGBOOST_TREE_HPP
#define XGBOOST_TREE_HPP
/*!
* \file xgboost_tree.hpp
* \brief implementation of regression tree
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include "xgboost_tree_model.h"
namespace xgboost{
namespace booster{
const bool rt_debug = false;
// whether to check bugs
const bool check_bug = false;
const float rt_eps = 1e-5f;
const float rt_2eps = rt_eps * 2.0f;
inline double sqr( double a ){
return a * a;
}
};
};
#include "../../utils/xgboost_fmap.h"
#include "xgboost_svdf_tree.hpp"
#include "xgboost_col_treemaker.hpp"
#include "xgboost_row_treemaker.hpp"
namespace xgboost{
namespace booster{
// regression tree, construction algorithm is seperated from this class
// see RegTreeUpdater
template<typename FMatrix>
class RegTreeTrainer : public InterfaceBooster<FMatrix>{
public:
RegTreeTrainer( void ){
silent = 0; tree_maker = 1;
// interact mode
interact_type = 0;
interact_node = 0;
// normally we won't have more than 64 OpenMP threads
threadtemp.resize( 64, ThreadEntry() );
}
virtual ~RegTreeTrainer( void ){}
public:
virtual void SetParam( const char *name, const char *val ){
if( !strcmp( name, "silent") ) silent = atoi( val );
if( !strcmp( name, "tree_maker") ) tree_maker = atoi( val );
if( !strncmp( name, "interact:", 9) ){
const char *ename = name + 9;
interact_node = atoi( val );
if( !strcmp( ename, "expand") ) {
interact_type = 1;
}
if( !strcmp( ename, "remove") ) {
interact_type = 2;
}
}
param.SetParam( name, val );
constrain.SetParam( name, val );
tree.param.SetParam( name, val );
}
virtual void LoadModel( utils::IStream &fi ){
tree.LoadModel( fi );
}
virtual void SaveModel( utils::IStream &fo ) const{
tree.SaveModel( fo );
}
virtual void InitModel( void ){
tree.InitModel();
}
public:
virtual void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &smat,
const std::vector<unsigned> &root_index ){
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
// interactive update
if( interact_type != 0 ){
switch( interact_type ){
case 1: this->ExpandNode( grad, hess, smat, root_index, interact_node ); return;
case 2: this->CollapseNode( grad, hess, smat, root_index, interact_node ); return;
default: utils::Error("unknown interact type");
}
}
if( !silent ){
printf( "\nbuild GBRT with %u instances\n", (unsigned)grad.size() );
}
int num_pruned;
switch( tree_maker ){
case 0: {
utils::Assert( !constrain.HasConstrain(), "tree maker 0 does not support constrain" );
RTreeUpdater<FMatrix> updater( param, tree, grad, hess, smat, root_index );
tree.param.max_depth = updater.do_boost( num_pruned );
break;
}
case 1:{
ColTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index, constrain );
maker.Make( tree.param.max_depth, num_pruned );
break;
}
case 2:{
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, smat, root_index, constrain );
maker.Make( tree.param.max_depth, num_pruned );
break;
}
default: utils::Error("unknown tree maker");
}
if( !silent ){
printf( "tree train end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n",
tree.param.num_roots, tree.num_extra_nodes(), num_pruned, tree.MaxDepth() );
}
}
virtual float Predict( const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
ThreadEntry &e = this->InitTmp();
this->PrepareTmp( fmat.GetRow(ridx), e );
int pid = this->GetLeafIndex( e.feat, e.funknown, gid );
this->DropTmp( fmat.GetRow(ridx), e );
return tree[ pid ].leaf_value();
}
virtual int GetLeafIndex( const std::vector<float> &feat,
const std::vector<bool> &funknown,
unsigned gid = 0 ){
// start from groups that belongs to current data
int pid = (int)gid;
// tranverse tree
while( !tree[ pid ].is_leaf() ){
unsigned split_index = tree[ pid ].split_index();
pid = this->GetNext( pid, feat[ split_index ], funknown[ split_index ] );
}
return pid;
}
virtual void PredPath( std::vector<int> &path, const FMatrix &fmat, bst_uint ridx, unsigned gid = 0 ){
path.clear();
ThreadEntry &e = this->InitTmp();
this->PrepareTmp( fmat.GetRow(ridx), e );
int pid = (int)gid;
path.push_back( pid );
// tranverse tree
while( !tree[ pid ].is_leaf() ){
unsigned split_index = tree[ pid ].split_index();
pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
path.push_back( pid );
}
this->DropTmp( fmat.GetRow(ridx), e );
}
virtual float Predict( const std::vector<float> &feat,
const std::vector<bool> &funknown,
unsigned gid = 0 ){
utils::Assert( feat.size() >= (size_t)tree.param.num_feature,
"input data smaller than num feature" );
int pid = this->GetLeafIndex( feat, funknown, gid );
return tree[ pid ].leaf_value();
}
virtual void DumpModel( FILE *fo, const utils::FeatMap &fmap, bool with_stats ){
tree.DumpModel( fo, fmap, with_stats );
}
private:
inline void CollapseNode( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &fmat,
const std::vector<unsigned> &root_index,
int nid ){
std::vector<bst_uint> valid_index;
for( size_t i = 0; i < grad.size(); i ++ ){
ThreadEntry &e = this->InitTmp();
this->PrepareTmp( fmat.GetRow(i), e );
int pid = root_index.size() == 0 ? 0 : (int)root_index[i];
// tranverse tree
while( !tree[ pid ].is_leaf() ){
unsigned split_index = tree[ pid ].split_index();
pid = this->GetNext( pid, e.feat[ split_index ], e.funknown[ split_index ] );
if( pid == nid ){
valid_index.push_back( static_cast<bst_uint>(i) ); break;
}
}
this->DropTmp( fmat.GetRow(i), e );
}
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, fmat, root_index, constrain );
maker.Collapse( valid_index, nid );
if( !silent ){
printf( "tree collapse end, max_depth=%d\n", tree.param.max_depth );
}
}
inline void ExpandNode( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &fmat,
const std::vector<unsigned> &root_index,
int nid ){
std::vector<bst_uint> valid_index;
for( size_t i = 0; i < grad.size(); i ++ ){
ThreadEntry &e = this->InitTmp();
this->PrepareTmp( fmat.GetRow(i), e );
unsigned rtidx = root_index.size() == 0 ? 0 : root_index[i];
int pid = this->GetLeafIndex( e.feat, e.funknown, rtidx );
this->DropTmp( fmat.GetRow(i), e );
if( pid == nid ) valid_index.push_back( static_cast<bst_uint>(i) );
}
RowTreeMaker<FMatrix> maker( tree, param, grad, hess, fmat, root_index, constrain );
bool success = maker.Expand( valid_index, nid );
if( !silent ){
printf( "tree expand end, success=%d, max_depth=%d\n", (int)success, tree.MaxDepth() );
}
}
private:
// silent
int silent;
RegTree tree;
TreeParamTrain param;
private:
// some training parameters
// tree maker
int tree_maker;
// interaction
int interact_type;
int interact_node;
// feature constrain
utils::FeatConstrain constrain;
private:
struct ThreadEntry{
std::vector<float> feat;
std::vector<bool> funknown;
};
std::vector<ThreadEntry> threadtemp;
private:
inline ThreadEntry& InitTmp( void ){
const int tid = omp_get_thread_num();
utils::Assert( tid < (int)threadtemp.size(), "RTreeUpdater: threadtemp pool is too small" );
ThreadEntry &e = threadtemp[ tid ];
if( e.feat.size() != (size_t)tree.param.num_feature ){
e.feat.resize( tree.param.num_feature );
e.funknown.resize( tree.param.num_feature );
std::fill( e.funknown.begin(), e.funknown.end(), true );
}
return e;
}
inline void PrepareTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
while( it.Next() ){
const bst_uint findex = it.findex();
utils::Assert( findex < (unsigned)tree.param.num_feature , "input feature execeed bound" );
e.funknown[ findex ] = false;
e.feat[ findex ] = it.fvalue();
}
}
inline void DropTmp( typename FMatrix::RowIter it, ThreadEntry &e ){
while( it.Next() ){
e.funknown[ it.findex() ] = true;
}
}
inline int GetNext( int pid, float fvalue, bool is_unknown ){
float split_value = tree[ pid ].split_cond();
if( is_unknown ){
return tree[ pid ].cdefault();
}else{
if( fvalue < split_value ) return tree[ pid ].cleft();
else return tree[ pid ].cright();
}
}
};
};
};
#endif

View File

@@ -0,0 +1,554 @@
#ifndef XGBOOST_TREE_MODEL_H
#define XGBOOST_TREE_MODEL_H
/*!
* \file xgboost_tree_model.h
* \brief generic definition of model structure used in tree models
* used to support learning of boosting tree
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstring>
#include "../../utils/xgboost_utils.h"
#include "../../utils/xgboost_stream.h"
namespace xgboost{
namespace booster{
/*!
* \brief template class of TreeModel
* \tparam TSplitCond data type to indicate split condition
* \tparam TNodeStat auxiliary statistics of node to help tree building
*/
template<typename TSplitCond,typename TNodeStat>
class TreeModel{
public:
/*! \brief data type to indicate split condition */
typedef TNodeStat NodeStat;
/*! \brief auxiliary statistics of node to help tree building */
typedef TSplitCond SplitCond;
public:
/*! \brief parameters of the tree */
struct Param{
/*! \brief number of start root */
int num_roots;
/*! \brief total number of nodes */
int num_nodes;
/*!\brief number of deleted nodes */
int num_deleted;
/*! \brief maximum depth, this is a statistics of the tree */
int max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
/*! \brief reserved part */
int reserved[ 32 ];
/*! \brief constructor */
Param( void ){
max_depth = 0;
memset( reserved, 0, sizeof( reserved ) );
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strcmp("num_roots", name ) ) num_roots = atoi( val );
if( !strcmp("num_feature", name ) ) num_feature = atoi( val );
}
};
/*! \brief tree node */
class Node{
private:
friend class TreeModel<TSplitCond,TNodeStat>;
/*!
* \brief in leaf node, we have weights, in non-leaf nodes,
* we have split condition
*/
union Info{
float leaf_value;
TSplitCond split_cond;
};
private:
// pointer to parent, highest bit is used to indicate whether it's a left child or not
int parent_;
// pointer to left, right
int cleft_, cright_;
// split feature index, left split or right split depends on the highest bit
unsigned sindex_;
// extra info
Info info_;
private:
inline void set_parent( int pidx, bool is_left_child = true ){
if( is_left_child ) pidx |= (1U << 31);
this->parent_ = pidx;
}
public:
/*! \brief index of left child */
inline int cleft( void ) const{
return this->cleft_;
}
/*! \brief index of right child */
inline int cright( void ) const{
return this->cright_;
}
/*! \brief index of default child when feature is missing */
inline int cdefault( void ) const{
return this->default_left() ? this->cleft() : this->cright();
}
/*! \brief feature index of split condition */
inline unsigned split_index( void ) const{
return sindex_ & ( (1U<<31) - 1U );
}
/*! \brief when feature is unknown, whether goes to left child */
inline bool default_left( void ) const{
return (sindex_ >> 31) != 0;
}
/*! \brief whether current node is leaf node */
inline bool is_leaf( void ) const{
return cleft_ == -1;
}
/*! \brief get leaf value of leaf node */
inline float leaf_value( void ) const{
return (this->info_).leaf_value;
}
/*! \brief get split condition of the node */
inline TSplitCond split_cond( void ) const{
return (this->info_).split_cond;
}
/*! \brief get parent of the node */
inline int parent( void ) const{
return parent_ & ( (1U << 31) - 1 );
}
/*! \brief whether current node is left child */
inline bool is_left_child( void ) const{
return ( parent_ & (1U << 31)) != 0;
}
/*! \brief whether current node is root */
inline bool is_root( void ) const{
return parent_ == -1;
}
/*!
* \brief set the right child
* \param nide node id to right child
*/
inline void set_right_child( int nid ){
this->cright_ = nid;
}
/*!
* \brief set split condition of current node
* \param split_index feature index to split
* \param split_cond split condition
* \param default_left the default direction when feature is unknown
*/
inline void set_split( unsigned split_index, TSplitCond split_cond, bool default_left = false ){
if( default_left ) split_index |= (1U << 31);
this->sindex_ = split_index;
(this->info_).split_cond = split_cond;
}
/*!
* \brief set the leaf value of the node
* \param value leaf value
* \param right right index, could be used to store
* additional information
*/
inline void set_leaf( float value, int right = -1 ){
(this->info_).leaf_value = value;
this->cleft_ = -1;
this->cright_ = right;
}
};
protected:
// vector of nodes
std::vector<Node> nodes;
// stats of nodes
std::vector<TNodeStat> stats;
protected:
// free node space, used during training process
std::vector<int> deleted_nodes;
// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
inline int AllocNode( void ){
if( param.num_deleted != 0 ){
int nd = deleted_nodes.back();
deleted_nodes.pop_back();
param.num_deleted --;
return nd;
}
int nd = param.num_nodes ++;
nodes.resize( param.num_nodes );
stats.resize( param.num_nodes );
return nd;
}
// delete a tree node
inline void DeleteNode( int nid ){
utils::Assert( nid >= param.num_roots, "can not delete root");
deleted_nodes.push_back( nid );
nodes[ nid ].set_parent( -1 );
param.num_deleted ++;
}
public:
/*!
* \brief change a non leaf node to a leaf node, delete its children
* \param rid node id of the node
* \param new leaf value
*/
inline void ChangeToLeaf( int rid, float value ){
utils::Assert( nodes[ nodes[rid].cleft() ].is_leaf(), "can not delete a non termial child");
utils::Assert( nodes[ nodes[rid].cright() ].is_leaf(), "can not delete a non termial child");
this->DeleteNode( nodes[ rid ].cleft() );
this->DeleteNode( nodes[ rid ].cright() );
nodes[ rid ].set_leaf( value );
}
/*!
* \brief collapse a non leaf node to a leaf node, delete its children
* \param rid node id of the node
* \param new leaf value
*/
inline void CollapseToLeaf( int rid, float value ){
if( nodes[rid].is_leaf() ) return;
if( !nodes[ nodes[rid].cleft() ].is_leaf() ){
CollapseToLeaf( nodes[rid].cleft(), 0.0f );
}
if( !nodes[ nodes[rid].cright() ].is_leaf() ){
CollapseToLeaf( nodes[rid].cright(), 0.0f );
}
this->ChangeToLeaf( rid, value );
}
public:
/*! \brief model parameter */
Param param;
public:
/*! \brief constructor */
TreeModel( void ){
param.num_nodes = 1;
param.num_roots = 1;
param.num_deleted = 0;
nodes.resize( 1 );
}
/*! \brief get node given nid */
inline Node &operator[]( int nid ){
return nodes[ nid ];
}
/*! \brief get node statistics given nid */
inline NodeStat &stat( int nid ){
return stats[ nid ];
}
/*! \brief initialize the model */
inline void InitModel( void ){
param.num_nodes = param.num_roots;
nodes.resize( param.num_nodes );
stats.resize( param.num_nodes );
for( int i = 0; i < param.num_nodes; i ++ ){
nodes[i].set_leaf( 0.0f );
nodes[i].set_parent( -1 );
}
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel( utils::IStream &fi ){
utils::Assert( fi.Read( &param, sizeof(Param) ) > 0, "TreeModel" );
nodes.resize( param.num_nodes ); stats.resize( param.num_nodes );
utils::Assert( fi.Read( &nodes[0], sizeof(Node) * nodes.size() ) > 0, "TreeModel::Node" );
utils::Assert( fi.Read( &stats[0], sizeof(NodeStat) * stats.size() ) > 0, "TreeModel::Node" );
deleted_nodes.resize( 0 );
for( int i = param.num_roots; i < param.num_nodes; i ++ ){
if( nodes[i].is_root() ) deleted_nodes.push_back( i );
}
utils::Assert( (int)deleted_nodes.size() == param.num_deleted, "number of deleted nodes do not match" );
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel( utils::IStream &fo ) const{
utils::Assert( param.num_nodes == (int)nodes.size() );
utils::Assert( param.num_nodes == (int)stats.size() );
fo.Write( &param, sizeof(Param) );
fo.Write( &nodes[0], sizeof(Node) * nodes.size() );
fo.Write( &stats[0], sizeof(NodeStat) * nodes.size() );
}
/*!
* \brief add child nodes to node
* \param nid node id to add childs
*/
inline void AddChilds( int nid ){
int pleft = this->AllocNode();
int pright = this->AllocNode();
nodes[ nid ].cleft_ = pleft;
nodes[ nid ].cright_ = pright;
nodes[ nodes[ nid ].cleft() ].set_parent( nid, true );
nodes[ nodes[ nid ].cright() ].set_parent( nid, false );
}
/*!
* \brief only add a right child to a leaf node
* \param node id to add right child
*/
inline void AddRightChild( int nid ){
int pright = this->AllocNode();
nodes[ nid ].right = pright;
nodes[ nodes[ nid ].right ].set_parent( nid, false );
}
/*!
* \brief get current depth
* \param nid node id
* \param pass_rchild whether right child is not counted in depth
*/
inline int GetDepth( int nid, bool pass_rchild = false ) const{
int depth = 0;
while( !nodes[ nid ].is_root() ){
if( !pass_rchild || nodes[ nid ].is_left_child() ) depth ++;
nid = nodes[ nid ].parent();
}
return depth;
}
/*!
* \brief get maximum depth
* \param nid node id
*/
inline int MaxDepth( int nid ) const{
if( nodes[nid].is_leaf() ) return 0;
return std::max( MaxDepth( nodes[nid].cleft() )+1,
MaxDepth( nodes[nid].cright() )+1 );
}
/*!
* \brief get maximum depth
*/
inline int MaxDepth( void ){
int maxd = 0;
for( int i = 0; i < param.num_roots; ++ i ){
maxd = std::max( maxd, MaxDepth( i ) );
}
return maxd;
}
/*! \brief number of extra nodes besides the root */
inline int num_extra_nodes( void ) const {
return param.num_nodes - param.num_roots - param.num_deleted;
}
/*! \brief dump model to text file */
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
this->Dump( 0, fo, fmap, 0, with_stats );
}
private:
void Dump( int nid, FILE *fo, const utils::FeatMap& fmap, int depth, bool with_stats ){
for( int i = 0; i < depth; ++ i ){
fprintf( fo, "\t" );
}
if( nodes[ nid ].is_leaf() ){
fprintf( fo, "%d:leaf=%f ", nid, nodes[ nid ].leaf_value() );
if( with_stats ){
stat( nid ).Print( fo, true );
}
fprintf( fo, "\n" );
}else{
// right then left,
TSplitCond cond = nodes[ nid ].split_cond();
const unsigned split_index = nodes[ nid ].split_index();
if( split_index < fmap.size() ){
switch( fmap.type(split_index) ){
case utils::FeatMap::kIndicator:{
int nyes = nodes[ nid ].default_left()?nodes[nid].cright():nodes[nid].cleft();
fprintf( fo, "%d:[%s] yes=%d,no=%d",
nid, fmap.name( split_index ),
nyes, nodes[nid].cdefault() );
break;
}
case utils::FeatMap::kInteger:{
fprintf( fo, "%d:[%s<%d] yes=%d,no=%d,missing=%d",
nid, fmap.name(split_index), int( float(cond)+1.0f),
nodes[ nid ].cleft(), nodes[ nid ].cright(),
nodes[ nid ].cdefault() );
break;
}
case utils::FeatMap::kFloat:
case utils::FeatMap::kQuantitive:{
fprintf( fo, "%d:[%s<%f] yes=%d,no=%d,missing=%d",
nid, fmap.name(split_index), float(cond),
nodes[ nid ].cleft(), nodes[ nid ].cright(),
nodes[ nid ].cdefault() );
break;
}
default: utils::Error("unknown fmap type");
}
}else{
fprintf( fo, "%d:[f%u<%f] yes=%d,no=%d,missing=%d",
nid, split_index, float(cond),
nodes[ nid ].cleft(), nodes[ nid ].cright(),
nodes[ nid ].cdefault() );
}
if( with_stats ){
fprintf( fo, " ");
stat( nid ).Print( fo, false );
}
fprintf( fo, "\n" );
this->Dump( nodes[ nid ].cleft(), fo, fmap, depth+1, with_stats );
this->Dump( nodes[ nid ].cright(), fo, fmap, depth+1, with_stats );
}
}
};
};
namespace booster{
/*! \brief training parameters for regression tree */
struct TreeParamTrain{
// learning step size for a time
float learning_rate;
// minimum loss change required for a split
float min_split_loss;
// maximum depth of a tree
int max_depth;
//----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child
float min_child_weight;
// weight decay parameter used to control leaf fitting
float reg_lambda;
// reg method
int reg_method;
// default direction choice
int default_direction;
// whether we want to do subsample
float subsample;
// whether to use layerwise aware regularization
int use_layerwise;
// number of threads to be used for tree construction, if OpenMP is enabled, if equals 0, use system default
int nthread;
/*! \brief constructor */
TreeParamTrain( void ){
learning_rate = 0.3f;
min_child_weight = 1.0f;
max_depth = 6;
reg_lambda = 1.0f;
reg_method = 2;
default_direction = 0;
subsample = 1.0f;
use_layerwise = 0;
nthread = 0;
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
// sync-names
if( !strcmp( name, "gamma") ) min_split_loss = (float)atof( val );
if( !strcmp( name, "eta") ) learning_rate = (float)atof( val );
if( !strcmp( name, "lambda") ) reg_lambda = (float)atof( val );
// normal tree prameters
if( !strcmp( name, "learning_rate") ) learning_rate = (float)atof( val );
if( !strcmp( name, "min_child_weight") ) min_child_weight = (float)atof( val );
if( !strcmp( name, "min_split_loss") ) min_split_loss = (float)atof( val );
if( !strcmp( name, "max_depth") ) max_depth = atoi( val );
if( !strcmp( name, "reg_lambda") ) reg_lambda = (float)atof( val );
if( !strcmp( name, "reg_method") ) reg_method = (float)atof( val );
if( !strcmp( name, "subsample") ) subsample = (float)atof( val );
if( !strcmp( name, "use_layerwise") ) use_layerwise = atoi( val );
if( !strcmp( name, "nthread") ) nthread = atoi( val );
if( !strcmp( name, "default_direction") ) {
if( !strcmp( val, "learn") ) default_direction = 0;
if( !strcmp( val, "left") ) default_direction = 1;
if( !strcmp( val, "right") ) default_direction = 2;
}
}
protected:
// functions for L1 cost
static inline double ThresholdL1( double w, double lambda ){
if( w > +lambda ) return w - lambda;
if( w < -lambda ) return w + lambda;
return 0.0;
}
inline double CalcWeight( double sum_grad, double sum_hess )const{
if( sum_hess < min_child_weight ){
return 0.0;
}else{
switch( reg_method ){
case 1: return - ThresholdL1( sum_grad, reg_lambda ) / sum_hess;
case 2: return - sum_grad / ( sum_hess + reg_lambda );
// elstic net
case 3: return - ThresholdL1( sum_grad, 0.5 * reg_lambda ) / ( sum_hess + 0.5 * reg_lambda );
default: return - sum_grad / sum_hess;
}
}
}
private:
inline static double Sqr( double a ){
return a * a;
}
public:
// calculate the cost of loss function
inline double CalcGain( double sum_grad, double sum_hess ) const{
if( sum_hess < min_child_weight ){
return 0.0;
}
switch( reg_method ){
case 1 : return Sqr( ThresholdL1( sum_grad, reg_lambda ) ) / sum_hess;
case 2 : return Sqr( sum_grad ) / ( sum_hess + reg_lambda );
// elstic net
case 3 : return Sqr( ThresholdL1( sum_grad, 0.5 * reg_lambda ) ) / ( sum_hess + 0.5 * reg_lambda );
default: return Sqr( sum_grad ) / sum_hess;
}
}
// KEY:layerwise
// calculate cost of root
inline double CalcRootGain( double sum_grad, double sum_hess ) const{
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
else return 0.0;
}
// KEY:layerwise
// calculate the cost after split
// base_weight: the base_weight of parent
inline double CalcGain( double sum_grad, double sum_hess, double base_weight ) const{
if( use_layerwise == 0 ) return this->CalcGain( sum_grad, sum_hess );
else return this->CalcGain( sum_grad + sum_hess * base_weight, sum_hess );
}
// calculate the weight of leaf
inline double CalcWeight( double sum_grad, double sum_hess, double parent_base_weight )const{
if( use_layerwise == 0 ) return CalcWeight( sum_grad, sum_hess );
else return parent_base_weight + CalcWeight( sum_grad + parent_base_weight * sum_hess, sum_hess );
}
/*! \brief whether need forward small to big search: default right */
inline bool need_forward_search( void ) const{
return this->default_direction != 1;
}
/*! \brief whether need forward big to small search: default left */
inline bool need_backward_search( void ) const{
return this->default_direction != 2;
}
/*! \brief given the loss change, whether we need to invode prunning */
inline bool need_prune( double loss_chg, int depth ) const{
return loss_chg < this->min_split_loss;
}
/*! \brief whether we can split with current hessian */
inline bool cannot_split( double sum_hess, int depth ) const{
return sum_hess < this->min_child_weight * 2.0;
}
};
};
namespace booster{
/*! \brief node statistics used in regression tree */
struct RTreeNodeStat{
/*! \brief loss chg caused by current split */
float loss_chg;
/*! \brief sum of hessian values, used to measure coverage of data */
float sum_hess;
/*! \brief weight of current node */
float base_weight;
/*! \brief number of child that is leaf node known up to now */
int leaf_child_cnt;
/*! \brief print information of current stats to fo */
inline void Print( FILE *fo, bool is_leaf ) const{
if( !is_leaf ){
fprintf( fo, "gain=%f,cover=%f", loss_chg, sum_hess );
}else{
fprintf( fo, "cover=%f", sum_hess );
}
}
};
/*! \brief most comment structure of regression tree */
class RegTree: public TreeModel<bst_float,RTreeNodeStat>{
};
};
};
#endif

39
booster/xgboost-inl.hpp Normal file
View File

@@ -0,0 +1,39 @@
#ifndef XGBOOST_INL_HPP
#define XGBOOST_INL_HPP
/*!
* \file xgboost-inl.hpp
* \brief bootser implementations
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
// implementation of boosters go to here
// A good design should have minimum functions defined interface, user should only operate on interface
// I break it a bit, by using template and let user 'see' the implementation
// The user should pretend that they only can use the interface, and we are all cool
// I find this is the only way so far I can think of to make boosters invariant of data structure,
// while keep everything fast
#include "xgboost.h"
#include "../utils/xgboost_utils.h"
#include "tree/xgboost_tree.hpp"
#include "linear/xgboost_linear.hpp"
namespace xgboost{
namespace booster{
/*!
* \brief create a gradient booster, given type of booster
* \param booster_type type of gradient booster, can be used to specify implements
* \tparam FMatrix input data type for booster
* \return the pointer to the gradient booster created
*/
template<typename FMatrix>
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){
switch( booster_type ){
case 0: return new RegTreeTrainer<FMatrix>();
case 1: return new LinearBooster<FMatrix>();
default: utils::Error("unknown booster_type"); return NULL;
}
}
}; // namespace booster
}; // namespace xgboost
#endif // XGBOOST_INL_HPP

157
booster/xgboost.h Normal file
View File

@@ -0,0 +1,157 @@
#ifndef XGBOOST_H
#define XGBOOST_H
/*!
* \file xgboost.h
* \brief the general gradient boosting interface
*
* common practice of this header: use IBooster and CreateBooster<FMatrixS>
*
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_stream.h"
#include "../utils/xgboost_config.h"
#include "xgboost_data.h"
/*! \brief namespace for xboost package */
namespace xgboost{
/*! \brief namespace for boosters */
namespace booster{
/*!
* \brief interface of a gradient boosting learner
* \tparam FMatrix the feature matrix format that the booster takes
*/
template<typename FMatrix>
class InterfaceBooster{
public:
// interface for model setting and loading
// calling procedure:
// (1) booster->SetParam to setting necessary parameters
// (2) if it is first time usage of the model:
// call booster->InitModel
// else:
// call booster->LoadModel
// (3) booster->DoBoost to update the model
// (4) booster->Predict to get new prediction
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam( const char *name, const char *val ) = 0;
/*!
* \brief load model from stream
* \param fi input stream
*/
virtual void LoadModel( utils::IStream &fi ) = 0;
/*!
* \brief save model to stream
* \param fo output stream
*/
virtual void SaveModel( utils::IStream &fo ) const = 0;
/*!
* \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation
*/
virtual void InitModel( void ) = 0;
public:
/*!
* \brief do gradient boost training for one step, using the information given,
* Note: content of grad and hess can change after DoBoost
* \param grad first order gradient of each instance
* \param hess second order gradient of each instance
* \param feats features of each instance
* \param root_index pre-partitioned root index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved
*/
virtual void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &feats,
const std::vector<unsigned> &root_index ) = 0;
/*!
* \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree
* \param path the result of path
* \param feats feature matrix
* \param row_index row index in the feature matrix
* \param root_index root id of current instance, default = 0
*/
virtual void PredPath( std::vector<int> &path, const FMatrix &feats,
bst_uint row_index, unsigned root_index = 0 ){
utils::Error( "not implemented" );
}
/*!
* \brief predict values for given sparse feature vector
*
* NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general,
* dense version of Predict to ensures threadsafety
* \param feats feature matrix
* \param row_index row index in the feature matrix
* \param root_index root id of current instance, default = 0
* \return prediction
*/
virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){
utils::Error( "not implemented" );
return 0.0f;
}
/*!
* \brief predict values for given dense feature vector
* \param feat feature vector in dense format
* \param funknown indicator that the feature is missing
* \param rid root id of current instance, default = 0
* \return prediction
*/
virtual float Predict( const std::vector<float> &feat,
const std::vector<bool> &funknown,
unsigned rid = 0 ){
utils::Error( "not implemented" );
return 0.0f;
}
/*!
* \brief print information
* \param fo output stream
*/
virtual void PrintInfo( FILE *fo ){}
/*!
* \brief dump model into text file
* \param fo output stream
* \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics
*/
virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){
utils::Error( "not implemented" );
}
public:
/*! \brief virtual destructor */
virtual ~InterfaceBooster( void ){}
};
};
namespace booster{
/*!
* \brief this will is the most commonly used booster interface
* we try to make booster invariant of data structures, but most cases, FMatrixS is what we wnat
*/
typedef InterfaceBooster<FMatrixS> IBooster;
};
};
namespace xgboost{
namespace booster{
/*!
* \brief create a gradient booster, given type of booster
* normally we use FMatrixS, by calling CreateBooster<FMatrixS>
* \param booster_type type of gradient booster, can be used to specify implements
* \tparam FMatrix input data type for booster
* \return the pointer to the gradient booster created
*/
template<typename FMatrix>
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type );
};
};
// this file includes the template implementations of all boosters
// the cost of using template is that the user can 'see' all the implementations, which is OK
// ignore implementations and focus on the interface:)
#include "xgboost-inl.hpp"
#endif

393
booster/xgboost_data.h Normal file
View File

@@ -0,0 +1,393 @@
#ifndef XGBOOST_DATA_H
#define XGBOOST_DATA_H
/*!
* \file xgboost_data.h
* \brief the input data structure for gradient boosting
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include <climits>
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
#include "../utils/xgboost_matrix_csr.h"
namespace xgboost{
namespace booster{
/*! \brief interger type used in boost */
typedef int bst_int;
/*! \brief unsigned interger type used in boost */
typedef unsigned bst_uint;
/*! \brief float type used in boost */
typedef float bst_float;
/*! \brief debug option for booster */
const bool bst_debug = false;
};
};
namespace xgboost{
namespace booster{
/**
* \brief This is a interface, defining the way to access features,
* by column or by row. This interface is used to make implementation
* of booster does not depend on how feature is stored.
*
* Why template instead of virtual class: for efficiency
* feature matrix is going to be used by most inner loop of the algorithm
*
* \tparam Derived type of actual implementation
* \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing
*/
template<typename Derived>
struct FMatrix{
public:
/*! \brief exmaple iterator over one row */
struct RowIter{
/*!
* \brief move to next position
* \return whether there is element in next position
*/
inline bool Next( void );
/*! \return feature index in current position */
inline bst_uint findex( void ) const;
/*! \return feature value in current position */
inline bst_float fvalue( void ) const;
};
/*! \brief example iterator over one column */
struct ColIter{
/*!
* \brief move to next position
* \return whether there is element in next position
*/
inline bool Next( void );
/*! \return row index of current position */
inline bst_uint rindex( void ) const;
/*! \return feature value in current position */
inline bst_float fvalue( void ) const;
};
/*! \brief backward iterator over column */
struct ColBackIter : public ColIter {};
public:
/*!
* \brief get number of rows
* \return number of rows
*/
inline size_t NumRow( void ) const;
/*!
* \brief get number of columns
* \return number of columns
*/
inline size_t NumCol( void ) const;
/*!
* \brief get row iterator
* \param ridx row index
* \return row iterator
*/
inline RowIter GetRow( size_t ridx ) const;
/*!
* \brief get number of column groups, this ise used together with GetRow( ridx, gid )
* \return number of column group
*/
inline unsigned NumColGroup( void ) const{
return 1;
}
/*!
* \brief get row iterator, return iterator of specific column group
* \param ridx row index
* \param gid colmun group id
* \return row iterator, only iterates over features of specified column group
*/
inline RowIter GetRow( size_t ridx, unsigned gid ) const;
/*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const;
/*!
* \brief get column iterator, the columns must be sorted by feature value
* \param ridx column index
* \return column iterator
*/
inline ColIter GetSortedCol( size_t ridx ) const;
/*!
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
* \param ridx column index
* \return reverse column iterator
*/
inline ColBackIter GetReverseSortedCol( size_t ridx ) const;
};
};
};
namespace xgboost{
namespace booster{
/*!
* \brief feature matrix to store training instance, in sparse CSR format
*/
class FMatrixS: public FMatrix<FMatrixS>{
public:
/*! \brief one entry in a row */
struct REntry{
/*! \brief feature index */
bst_uint findex;
/*! \brief feature value */
bst_float fvalue;
/*! \brief constructor */
REntry( void ){}
/*! \brief constructor */
REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){}
inline static bool cmp_fvalue( const REntry &a, const REntry &b ){
return a.fvalue < b.fvalue;
}
};
/*! \brief one row of sparse feature matrix */
struct Line{
/*! \brief array of feature index */
const REntry *data_;
/*! \brief size of the data */
bst_uint len;
/*! \brief get k-th element */
inline const REntry& operator[]( unsigned i ) const{
return data_[i];
}
};
/*! \brief row iterator */
struct RowIter{
const REntry *dptr_, *end_;
RowIter( const REntry* dptr, const REntry* end )
:dptr_(dptr),end_(end){}
inline bool Next( void ){
if( dptr_ == end_ ) return false;
else{
++ dptr_; return true;
}
}
inline bst_uint findex( void ) const{
return dptr_->findex;
}
inline bst_float fvalue( void ) const{
return dptr_->fvalue;
}
};
/*! \brief column iterator */
struct ColIter: public RowIter{
ColIter( const REntry* dptr, const REntry* end )
:RowIter( dptr, end ){}
inline bst_uint rindex( void ) const{
return this->findex();
}
};
/*! \brief reverse column iterator */
struct ColBackIter: public ColIter{
ColBackIter( const REntry* dptr, const REntry* end )
:ColIter( dptr, end ){}
// shadows RowIter::Next
inline bool Next( void ){
if( dptr_ == end_ ) return false;
else{
-- dptr_; return true;
}
}
};
public:
/*! \brief constructor */
FMatrixS( void ){ this->Clear(); }
/*! \brief get number of rows */
inline size_t NumRow( void ) const{
return row_ptr_.size() - 1;
}
/*!
* \brief get number of nonzero entries
* \return number of nonzero entries
*/
inline size_t NumEntry( void ) const{
return row_data_.size();
}
/*! \brief clear the storage */
inline void Clear( void ){
row_ptr_.clear();
row_ptr_.push_back( 0 );
row_data_.clear();
col_ptr_.clear();
col_data_.clear();
}
/*! \brief get sparse part of current row */
inline Line operator[]( size_t sidx ) const{
Line sp;
utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" );
sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] );
sp.data_ = &row_data_[ row_ptr_[ sidx ] ];
return sp;
}
/*!
* \brief add a row to the matrix, with data stored in STL container
* \param findex feature index
* \param fvalue feature value
* \param fstart start bound of feature
* \param fend end bound range of feature
* \return the row id added line
*/
inline size_t AddRow( const std::vector<bst_uint> &findex,
const std::vector<bst_float> &fvalue,
unsigned fstart = 0, unsigned fend = UINT_MAX ){
utils::Assert( findex.size() == fvalue.size() );
unsigned cnt = 0;
for( size_t i = 0; i < findex.size(); i ++ ){
if( findex[i] < fstart || findex[i] >= fend ) continue;
row_data_.push_back( REntry( findex[i], fvalue[i] ) );
cnt ++;
}
row_ptr_.push_back( row_ptr_.back() + cnt );
return row_ptr_.size() - 2;
}
/*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx ) const{
utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" );
return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 );
}
/*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx, unsigned gid ) const{
utils::Assert( gid == 0, "FMatrixS only have 1 column group" );
return FMatrixS::GetRow( ridx );
}
public:
/*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const{
return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
}
/*! \brief get number of colmuns */
inline size_t NumCol( void ) const{
utils::Assert( this->HaveColAccess() );
return col_ptr_.size() - 1;
}
/*! \brief get col iterator*/
inline ColIter GetSortedCol( size_t cidx ) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 );
}
/*! \brief get col iterator */
inline ColBackIter GetReverseSortedCol( size_t cidx ) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" );
return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] );
}
/*!
* \brief intialize the data so that we have both column and row major
* access, call this whenever we need column access
*/
inline void InitData( void ){
utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ );
builder.InitBudget( 0 );
for( size_t i = 0; i < this->NumRow(); i ++ ){
for( RowIter it = this->GetRow(i); it.Next(); ){
builder.AddBudget( it.findex() );
}
}
builder.InitStorage();
for( size_t i = 0; i < this->NumRow(); i ++ ){
for( RowIter it = this->GetRow(i); it.Next(); ){
builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) );
}
}
// sort columns
unsigned ncol = static_cast<unsigned>( this->NumCol() );
for( unsigned i = 0; i < ncol; i ++ ){
std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue );
}
}
/*!
* \brief save data to binary stream
* note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machine
* \param fo output stream
*/
inline void SaveBinary( utils::IStream &fo ) const{
FMatrixS::SaveBinary( fo, row_ptr_, row_data_ );
int col_access = this->HaveColAccess() ? 1 : 0;
fo.Write( &col_access, sizeof(int) );
if( col_access != 0 ){
FMatrixS::SaveBinary( fo, col_ptr_, col_data_ );
}
}
/*!
* \brief load data from binary stream
* note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machin
* \param fi input stream
*/
inline void LoadBinary( utils::IStream &fi ){
FMatrixS::LoadBinary( fi, row_ptr_, row_data_ );
int col_access;
fi.Read( &col_access, sizeof(int) );
if( col_access != 0 ){
FMatrixS::LoadBinary( fi, col_ptr_, col_data_ );
}
}
/*!
* \brief load from text file
* \param fi input file pointer
*/
inline void LoadText( FILE *fi ){
this->Clear();
int ninst;
while( fscanf( fi, "%d", &ninst ) == 1 ){
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while( ninst -- ){
unsigned index; float value;
utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" );
findex.push_back( index ); fvalue.push_back( value );
}
this->AddRow( findex, fvalue );
}
// initialize column support as well
this->InitData();
}
private:
/*!
* \brief save data to binary stream
* \param fo output stream
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary( utils::IStream &fo,
const std::vector<size_t> &ptr,
const std::vector<REntry> &data ){
size_t nrow = ptr.size() - 1;
fo.Write( &nrow, sizeof(size_t) );
fo.Write( &ptr[0], ptr.size() * sizeof(size_t) );
if( data.size() != 0 ){
fo.Write( &data[0] , data.size() * sizeof(REntry) );
}
}
/*!
* \brief load data from binary stream
* \param fi input stream
* \param ptr pointer data
* \param data data content
*/
inline static void LoadBinary( utils::IStream &fi,
std::vector<size_t> &ptr,
std::vector<REntry> &data ){
size_t nrow;
utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" );
ptr.resize( nrow + 1 );
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" );
data.resize( ptr.back() );
if( data.size() != 0 ){
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" );
}
}
protected:
/*! \brief row pointer of CSR sparse storage */
std::vector<size_t> row_ptr_;
/*! \brief data in the row */
std::vector<REntry> row_data_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
/*! \brief column datas */
std::vector<REntry> col_data_;
};
};
};
#endif

385
booster/xgboost_gbmbase.h Normal file
View File

@@ -0,0 +1,385 @@
#ifndef XGBOOST_GBMBASE_H
#define XGBOOST_GBMBASE_H
#include <cstring>
#include "xgboost.h"
#include "xgboost_data.h"
#include "../utils/xgboost_omp.h"
#include "../utils/xgboost_config.h"
/*!
* \file xgboost_gbmbase.h
* \brief a base model class,
* that assembles the ensembles of booster together and do model update
* this class can be used as base code to create booster variants
*
* The detailed implementation of boosters should start by using the class
* provided by this file
*
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
namespace xgboost{
namespace booster{
/*!
* \brief a base model class,
* that assembles the ensembles of booster together and provide single routines to do prediction buffer and update
* this class can be used as base code to create booster variants
* *
* relation to xgboost.h:
* (1) xgboost.h provides a interface to a single booster(e.g. a single regression tree )
* while GBMBaseModel builds upon IBooster to build a class that
* ensembls the boosters together;
* (2) GBMBaseModel provides prediction buffering scheme to speedup training;
* (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles;
*
* Usage of this class, the number index gives calling dependencies:
* (1) model.SetParam to set the parameters
* (2) model.LoadModel to load old models or model.InitModel to create a new model
* (3) model.InitTrainer before calling model.Predict and model.DoBoost
* (4) model.Predict to get predictions given a instance
* (4) model.DoBoost to update the ensembles, add new booster to the model
* (4) model.SaveModel to save learned results
*
* Bufferring: each instance comes with a buffer_index in Predict.
* when mparam.num_pbuffer != 0, a unique buffer index can be
* assigned to each instance to buffer previous results of boosters,
* this helps to speedup training, so consider assign buffer_index
* for each training instances, if buffer_index = -1, the code
* recalculate things from scratch and will still works correctly
*/
class GBMBase{
public:
/*! \brief number of thread used */
GBMBase( void ){}
/*! \brief destructor */
virtual ~GBMBase( void ){
this->FreeSpace();
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strncmp( name, "bst:", 4 ) ){
cfg.PushBack( name + 4, val );
}
if( !strcmp( name, "silent") ){
cfg.PushBack( name, val );
}
tparam.SetParam( name, val );
if( boosters.size() == 0 ) mparam.SetParam( name, val );
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel( utils::IStream &fi ){
if( boosters.size() != 0 ) this->FreeSpace();
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
boosters.resize( mparam.num_boosters );
for( size_t i = 0; i < boosters.size(); i ++ ){
boosters[ i ] = booster::CreateBooster<FMatrixS>( mparam.booster_type );
boosters[ i ]->LoadModel( fi );
}
{// load info
booster_info.resize( mparam.num_boosters );
if( mparam.num_boosters != 0 ){
utils::Assert( fi.Read( &booster_info[0], sizeof(int)*mparam.num_boosters ) != 0 );
}
}
if( mparam.num_pbuffer != 0 ){
pred_buffer.resize ( mparam.num_pbuffer );
pred_counter.resize( mparam.num_pbuffer );
utils::Assert( fi.Read( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ) != 0 );
utils::Assert( fi.Read( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ) != 0 );
}
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel( utils::IStream &fo ) const {
utils::Assert( mparam.num_boosters == (int)boosters.size() );
fo.Write( &mparam, sizeof(ModelParam) );
for( size_t i = 0; i < boosters.size(); i ++ ){
boosters[ i ]->SaveModel( fo );
}
if( booster_info.size() != 0 ){
fo.Write( &booster_info[0], sizeof(int) * booster_info.size() );
}
if( mparam.num_pbuffer != 0 ){
fo.Write( &pred_buffer[0] , pred_buffer.size()*sizeof(float) );
fo.Write( &pred_counter[0], pred_counter.size()*sizeof(unsigned) );
}
}
/*!
* \brief initialize the current data storage for model, if the model is used first time, call this function
*/
inline void InitModel( void ){
pred_buffer.clear(); pred_counter.clear();
pred_buffer.resize ( mparam.num_pbuffer, 0.0 );
pred_counter.resize( mparam.num_pbuffer, 0 );
utils::Assert( mparam.num_boosters == 0 );
utils::Assert( boosters.size() == 0 );
}
/*!
* \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation
*/
inline void InitTrainer( void ){
if( tparam.nthread != 0 ){
omp_set_num_threads( tparam.nthread );
}
// make sure all the boosters get the latest parameters
for( size_t i = 0; i < this->boosters.size(); i ++ ){
this->ConfigBooster( this->boosters[i] );
}
}
/*!
* \brief DumpModel
* \param fo text file
* \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics
*/
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
for( size_t i = 0; i < boosters.size(); i ++ ){
fprintf( fo, "booster[%d]\n", (int)i );
boosters[i]->DumpModel( fo, fmap, with_stats );
}
}
/*!
* \brief Dump path of all trees
* \param fo text file
* \param data input data
*/
inline void DumpPath( FILE *fo, const FMatrixS &data ){
for( size_t i = 0; i < data.NumRow(); ++ i ){
for( size_t j = 0; j < boosters.size(); ++ j ){
if( j != 0 ) fprintf( fo, "\t" );
std::vector<int> path;
boosters[j]->PredPath( path, data, i );
fprintf( fo, "%d", path[0] );
for( size_t k = 1; k < path.size(); ++ k ){
fprintf( fo, ",%d", path[k] );
}
}
fprintf( fo, "\n" );
}
}
public:
/*!
* \brief do gradient boost training for one step, using the information given
* Note: content of grad and hess can change after DoBoost
* \param grad first order gradient of each instance
* \param hess second order gradient of each instance
* \param feats features of each instance
* \param root_index pre-partitioned root index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved
*/
inline void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const booster::FMatrixS &feats,
const std::vector<unsigned> &root_index ) {
booster::IBooster *bst = this->GetUpdateBooster();
bst->DoBoost( grad, hess, feats, root_index );
}
/*!
* \brief predict values for given sparse feature vector
* NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe
* \param feats feature matrix
* \param row_index row index in the feature matrix
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
* \param root_index root id of current instance, default = 0
* \return prediction
*/
inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
size_t istart = 0;
float psum = 0.0f;
// load buffered results if any
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" );
istart = this->pred_counter[ buffer_index ];
psum = this->pred_buffer [ buffer_index ];
}
for( size_t i = istart; i < this->boosters.size(); i ++ ){
psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
}
// updated the buffered results
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
this->pred_counter[ buffer_index ] = static_cast<unsigned>( boosters.size() );
this->pred_buffer [ buffer_index ] = psum;
}
return psum;
}
public:
//--------trial code for interactive update an existing booster------
//-------- usually not needed, ignore this region ---------
/*!
* \brief same as Predict, but removes the prediction of booster to be updated
* this function must be called once and only once for every data with pbuffer
*/
inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
float psum = this->Predict( feats, row_index, buffer_index, root_index );
if( tparam.reupdate_booster != -1 ){
const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
psum -= boosters[ bid ]->Predict( feats, row_index, root_index );
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
this->pred_buffer[ buffer_index ] = psum;
}
}
return psum;
}
/*! \brief delete the specified booster */
inline void DelteBooster( void ){
const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < mparam.num_boosters , "must specify booster index for deletion");
delete boosters[ bid ];
for( int i = bid + 1; i < mparam.num_boosters; ++ i ){
boosters[i-1] = boosters[ i ];
booster_info[i-1] = booster_info[ i ];
}
boosters.resize( mparam.num_boosters -= 1 );
booster_info.resize( boosters.size() );
}
/*! \brief update the prediction buffer, after booster have been updated */
inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){
if( tparam.reupdate_booster != -1 ){
const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
if( mparam.do_reboost == 0 && buffer_index >= 0 ){
this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index );
}
}
}
//-----------non public fields afterwards-------------
protected:
/*! \brief free space of the model */
inline void FreeSpace( void ){
for( size_t i = 0; i < boosters.size(); i ++ ){
delete boosters[i];
}
boosters.clear(); booster_info.clear(); mparam.num_boosters = 0;
}
/*! \brief configure a booster */
inline void ConfigBooster( booster::IBooster *bst ){
cfg.BeforeFirst();
while( cfg.Next() ){
bst->SetParam( cfg.name(), cfg.val() );
}
}
/*!
* \brief get a booster to update
* \return the booster created
*/
inline booster::IBooster *GetUpdateBooster( void ){
if( tparam.reupdate_booster != -1 ){
const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" );
this->ConfigBooster( boosters[bid] );
return boosters[ bid ];
}
if( mparam.do_reboost == 0 || boosters.size() == 0 ){
mparam.num_boosters += 1;
boosters.push_back( booster::CreateBooster<FMatrixS>( mparam.booster_type ) );
booster_info.push_back( 0 );
this->ConfigBooster( boosters.back() );
boosters.back()->InitModel();
}else{
this->ConfigBooster( boosters.back() );
}
return boosters.back();
}
protected:
/*! \brief model parameters */
struct ModelParam{
/*! \brief number of boosters */
int num_boosters;
/*! \brief type of tree used */
int booster_type;
/*! \brief number of root: default 0, means single tree */
int num_roots;
/*! \brief number of features to be used by boosters */
int num_feature;
/*! \brief size of predicton buffer allocated for buffering boosting computation */
int num_pbuffer;
/*!
* \brief whether we repeatly update a single booster each round: default 0
* set to 1 for linear booster, so that regularization term can be considered
*/
int do_reboost;
/*! \brief reserved parameters */
int reserved[ 32 ];
/*! \brief constructor */
ModelParam( void ){
num_boosters = 0;
booster_type = 0;
num_roots = num_feature = 0;
do_reboost = 0;
num_pbuffer = 0;
memset( reserved, 0, sizeof( reserved ) );
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strcmp("booster_type", name ) ){
booster_type = atoi( val );
// linear boost automatically set do reboost
if( booster_type == 1 ) do_reboost = 1;
}
if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val );
if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val );
if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val );
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
}
};
/*! \brief training parameters */
struct TrainParam{
/*! \brief number of OpenMP threads */
int nthread;
/*!
* \brief index of specific booster to be re-updated, default = -1: update new booster
* parameter this is part of trial interactive update mode
*/
int reupdate_booster;
/*! \brief constructor */
TrainParam( void ) {
nthread = 1;
reupdate_booster = -1;
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strcmp("nthread", name ) ) nthread = atoi( val );
if( !strcmp("interact:booster_index", name ) ) reupdate_booster = atoi( val );
}
};
protected:
/*! \brief model parameters */
ModelParam mparam;
/*! \brief training parameters */
TrainParam tparam;
protected:
/*! \brief component boosters */
std::vector<booster::IBooster*> boosters;
/*! \brief some information indicator of the booster, reserved */
std::vector<int> booster_info;
/*! \brief prediction buffer */
std::vector<float> pred_buffer;
/*! \brief prediction buffer counter, record the progress so fart of the buffer */
std::vector<unsigned> pred_counter;
/*! \brief configurations saved for each booster */
utils::ConfigSaver cfg;
};
};
};
#endif

View File

@@ -0,0 +1,14 @@
Demonstrating how to use XGBoost accomplish binary classification tasks on UCI mushroom dataset http://archive.ics.uci.edu/ml/datasets/Mushroom
Run: ./runexp.sh
Format of input: LIBSVM format
Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
- Feature id must be from 0 to number of features, in sorted order.
- i means this feature is binary indicator feature
- q means this feature is a quantitative value, such as age, time, can be missing
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
Explainations: https://github.com/tqchen/xgboost/wiki/Binary-Classification

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,32 @@
1. cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
green=r,orange=o,pink=p,purple=u,red=e,
white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n,
scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
urban=u,waste=w,woods=d

View File

@@ -0,0 +1,148 @@
1. Title: Mushroom Database
2. Sources:
(a) Mushroom records drawn from The Audubon Society Field Guide to North
American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
A. Knopf
(b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
(c) Date: 27 April 1987
3. Past Usage:
1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
Adjustment (Technical Report 87-19). Doctoral disseration, Department
of Information and Computer Science, University of California, Irvine.
--- STAGGER: asymptoted to 95% classification accuracy after reviewing
1000 instances.
2. Iba,W., Wogulis,J., & Langley,P. (1988). Trading off Simplicity
and Coverage in Incremental Concept Learning. In Proceedings of
the 5th International Conference on Machine Learning, 73-79.
Ann Arbor, Michigan: Morgan Kaufmann.
-- approximately the same results with their HILLARY algorithm
3. In the following references a set of rules (given below) were
learned for this data set which may serve as a point of
comparison for other researchers.
Duch W, Adamczak R, Grabczewski K (1996) Extraction of logical rules
from training data using backpropagation networks, in: Proc. of the
The 1st Online Workshop on Soft Computing, 19-30.Aug.1996, pp. 25-30,
available on-line at: http://www.bioele.nuee.nagoya-u.ac.jp/wsc1/
Duch W, Adamczak R, Grabczewski K, Ishikawa M, Ueda H, Extraction of
crisp logical rules using constrained backpropagation networks -
comparison of two new approaches, in: Proc. of the European Symposium
on Artificial Neural Networks (ESANN'97), Bruge, Belgium 16-18.4.1997,
pp. xx-xx
Wlodzislaw Duch, Department of Computer Methods, Nicholas Copernicus
University, 87-100 Torun, Grudziadzka 5, Poland
e-mail: duch@phys.uni.torun.pl
WWW http://www.phys.uni.torun.pl/kmk/
Date: Mon, 17 Feb 1997 13:47:40 +0100
From: Wlodzislaw Duch <duch@phys.uni.torun.pl>
Organization: Dept. of Computer Methods, UMK
I have attached a file containing logical rules for mushrooms.
It should be helpful for other people since only in the last year I
have seen about 10 papers analyzing this dataset and obtaining quite
complex rules. We will try to contribute other results later.
With best regards, Wlodek Duch
________________________________________________________________
Logical rules for the mushroom data sets.
Logical rules given below seem to be the simplest possible for the
mushroom dataset and therefore should be treated as benchmark results.
Disjunctive rules for poisonous mushrooms, from most general
to most specific:
P_1) odor=NOT(almond.OR.anise.OR.none)
120 poisonous cases missed, 98.52% accuracy
P_2) spore-print-color=green
48 cases missed, 99.41% accuracy
P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.
(stalk-color-above-ring=NOT.brown)
8 cases missed, 99.90% accuracy
P_4) habitat=leaves.AND.cap-color=white
100% accuracy
Rule P_4) may also be
P_4') population=clustered.AND.cap_color=white
These rule involve 6 attributes (out of 22). Rules for edible
mushrooms are obtained as negation of the rules given above, for
example the rule:
odor=(almond.OR.anise.OR.none).AND.spore-print-color=NOT.green
gives 48 errors, or 99.41% accuracy on the whole dataset.
Several slightly more complex variations on these rules exist,
involving other attributes, such as gill_size, gill_spacing,
stalk_surface_above_ring, but the rules given above are the simplest
we have found.
4. Relevant Information:
This data set includes descriptions of hypothetical samples
corresponding to 23 species of gilled mushrooms in the Agaricus and
Lepiota Family (pp. 500-525). Each species is identified as
definitely edible, definitely poisonous, or of unknown edibility and
not recommended. This latter class was combined with the poisonous
one. The Guide clearly states that there is no simple rule for
determining the edibility of a mushroom; no rule like ``leaflets
three, let it be'' for Poisonous Oak and Ivy.
5. Number of Instances: 8124
6. Number of Attributes: 22 (all nominally valued)
7. Attribute Information: (classes: edible=e, poisonous=p)
1. cap-shape: bell=b,conical=c,convex=x,flat=f,
knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,
pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
green=r,orange=o,pink=p,purple=u,red=e,
white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n,
scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
urban=u,waste=w,woods=d
8. Missing Attribute Values: 2480 of them (denoted by "?"), all for
attribute #11.
9. Class Distribution:
-- edible: 4208 (51.8%)
-- poisonous: 3916 (48.2%)
-- total: 8124 instances

View File

@@ -0,0 +1,50 @@
#!/usr/bin/python
import sys
def loadfmap( fname ):
fmap = {}
nmap = {}
for l in open( fname ):
arr = l.split()
if arr[0].find('.') != -1:
idx = int( arr[0].strip('.') )
assert idx not in fmap
fmap[ idx ] = {}
ftype = arr[1].strip(':')
content = arr[2]
else:
content = arr[0]
for it in content.split(','):
if it.strip() == '':
continue
k , v = it.split('=')
fmap[ idx ][ v ] = len(nmap)
nmap[ len(nmap) ] = ftype+'='+k
return fmap, nmap
def write_nmap( fo, nmap ):
for i in xrange( len(nmap) ):
fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
# start here
fmap, nmap = loadfmap( 'agaricus-lepiota.fmap' )
fo = open( 'featmap.txt', 'w' )
write_nmap( fo, nmap )
fo.close()
fo = open( 'agaricus.txt', 'w' )
for l in open( 'agaricus-lepiota.data' ):
arr = l.split(',')
if arr[0] == 'p':
fo.write('1')
else:
assert arr[0] == 'e'
fo.write('0')
for i in xrange( 1,len(arr) ):
fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
fo.write('\n')
fo.close()

View File

@@ -0,0 +1,29 @@
#!/usr/bin/python
import sys
import random
if len(sys.argv) < 2:
print 'Usage:<filename> <k> [nfold = 5]'
exit(0)
random.seed( 10 )
k = int( sys.argv[2] )
if len(sys.argv) > 3:
nfold = int( sys.argv[3] )
else:
nfold = 5
fi = open( sys.argv[1], 'r' )
ftr = open( sys.argv[1]+'.train', 'w' )
fte = open( sys.argv[1]+'.test', 'w' )
for l in fi:
if random.randint( 1 , nfold ) == k:
fte.write( l )
else:
ftr.write( l )
fi.close()
ftr.close()
fte.close()

View File

@@ -0,0 +1,27 @@
# General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear
booster_type = 0
# choose logistic regression loss function for binary classification
loss_type = 2
# Tree Booster Parameters
# step size shrinkage
bst:eta = 1.0
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "agaricus.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "agaricus.txt.test"
# The path of test data
test:data = "agaricus.txt.test"

View File

@@ -0,0 +1,15 @@
#!/bin/bash
# map feature using indicator encoding, also produce featmap.txt
python mapfeat.py
# split train and test
python mknfold.py agaricus.txt 1
# training and output the models
../../xgboost mushroom.conf
# output prediction task=pred
../../xgboost mushroom.conf task=pred model_in=0002.model
# print the boosters of 00002.model in dump.raw.txt
../../xgboost mushroom.conf task=dump model_in=0002.model name_dump=dump.raw.txt
# use the feature map in printing for better visualization
../../xgboost mushroom.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt

13
demo/regression/README Normal file
View File

@@ -0,0 +1,13 @@
Demonstrating how to use XGBoost accomplish regression tasks on computer hardware dataset https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
Run: ./runexp.sh
Format of input: LIBSVM format
Format of ```featmap.txt: <featureid> <featurename> <q or i or int>\n ```:
- Feature id must be from 0 to number of features, in sorted order.
- i means this feature is binary indicator feature
- q means this feature is a quantitative value, such as age, time, can be missing
- int means this feature is integer value (when int is hinted, the decision boundary will be integer)
Explainations: https://github.com/tqchen/xgboost/wiki/Regression

View File

@@ -0,0 +1,30 @@
# General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear
booster_type = 0
# this is the only difference with classification, use 0: linear regression
# when labels are in [0,1] we can also use 1: logistic regression
loss_type = 0
# Tree Booster Parameters
# step size shrinkage
bst:eta = 1.0
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
# Task parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "machine.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "machine.txt.test"
# The path of test data
test:data = "machine.txt.test"

View File

@@ -0,0 +1,209 @@
adviser,32/60,125,256,6000,256,16,128,198,199
amdahl,470v/7,29,8000,32000,32,8,32,269,253
amdahl,470v/7a,29,8000,32000,32,8,32,220,253
amdahl,470v/7b,29,8000,32000,32,8,32,172,253
amdahl,470v/7c,29,8000,16000,32,8,16,132,132
amdahl,470v/b,26,8000,32000,64,8,32,318,290
amdahl,580-5840,23,16000,32000,64,16,32,367,381
amdahl,580-5850,23,16000,32000,64,16,32,489,381
amdahl,580-5860,23,16000,64000,64,16,32,636,749
amdahl,580-5880,23,32000,64000,128,32,64,1144,1238
apollo,dn320,400,1000,3000,0,1,2,38,23
apollo,dn420,400,512,3500,4,1,6,40,24
basf,7/65,60,2000,8000,65,1,8,92,70
basf,7/68,50,4000,16000,65,1,8,138,117
bti,5000,350,64,64,0,1,4,10,15
bti,8000,200,512,16000,0,4,32,35,64
burroughs,b1955,167,524,2000,8,4,15,19,23
burroughs,b2900,143,512,5000,0,7,32,28,29
burroughs,b2925,143,1000,2000,0,5,16,31,22
burroughs,b4955,110,5000,5000,142,8,64,120,124
burroughs,b5900,143,1500,6300,0,5,32,30,35
burroughs,b5920,143,3100,6200,0,5,20,33,39
burroughs,b6900,143,2300,6200,0,6,64,61,40
burroughs,b6925,110,3100,6200,0,6,64,76,45
c.r.d,68/10-80,320,128,6000,0,1,12,23,28
c.r.d,universe:2203t,320,512,2000,4,1,3,69,21
c.r.d,universe:68,320,256,6000,0,1,6,33,28
c.r.d,universe:68/05,320,256,3000,4,1,3,27,22
c.r.d,universe:68/137,320,512,5000,4,1,5,77,28
c.r.d,universe:68/37,320,256,5000,4,1,6,27,27
cdc,cyber:170/750,25,1310,2620,131,12,24,274,102
cdc,cyber:170/760,25,1310,2620,131,12,24,368,102
cdc,cyber:170/815,50,2620,10480,30,12,24,32,74
cdc,cyber:170/825,50,2620,10480,30,12,24,63,74
cdc,cyber:170/835,56,5240,20970,30,12,24,106,138
cdc,cyber:170/845,64,5240,20970,30,12,24,208,136
cdc,omega:480-i,50,500,2000,8,1,4,20,23
cdc,omega:480-ii,50,1000,4000,8,1,5,29,29
cdc,omega:480-iii,50,2000,8000,8,1,5,71,44
cambex,1636-1,50,1000,4000,8,3,5,26,30
cambex,1636-10,50,1000,8000,8,3,5,36,41
cambex,1641-1,50,2000,16000,8,3,5,40,74
cambex,1641-11,50,2000,16000,8,3,6,52,74
cambex,1651-1,50,2000,16000,8,3,6,60,74
dec,decsys:10:1091,133,1000,12000,9,3,12,72,54
dec,decsys:20:2060,133,1000,8000,9,3,12,72,41
dec,microvax-1,810,512,512,8,1,1,18,18
dec,vax:11/730,810,1000,5000,0,1,1,20,28
dec,vax:11/750,320,512,8000,4,1,5,40,36
dec,vax:11/780,200,512,8000,8,1,8,62,38
dg,eclipse:c/350,700,384,8000,0,1,1,24,34
dg,eclipse:m/600,700,256,2000,0,1,1,24,19
dg,eclipse:mv/10000,140,1000,16000,16,1,3,138,72
dg,eclipse:mv/4000,200,1000,8000,0,1,2,36,36
dg,eclipse:mv/6000,110,1000,4000,16,1,2,26,30
dg,eclipse:mv/8000,110,1000,12000,16,1,2,60,56
dg,eclipse:mv/8000-ii,220,1000,8000,16,1,2,71,42
formation,f4000/100,800,256,8000,0,1,4,12,34
formation,f4000/200,800,256,8000,0,1,4,14,34
formation,f4000/200ap,800,256,8000,0,1,4,20,34
formation,f4000/300,800,256,8000,0,1,4,16,34
formation,f4000/300ap,800,256,8000,0,1,4,22,34
four-phase,2000/260,125,512,1000,0,8,20,36,19
gould,concept:32/8705,75,2000,8000,64,1,38,144,75
gould,concept:32/8750,75,2000,16000,64,1,38,144,113
gould,concept:32/8780,75,2000,16000,128,1,38,259,157
hp,3000/30,90,256,1000,0,3,10,17,18
hp,3000/40,105,256,2000,0,3,10,26,20
hp,3000/44,105,1000,4000,0,3,24,32,28
hp,3000/48,105,2000,4000,8,3,19,32,33
hp,3000/64,75,2000,8000,8,3,24,62,47
hp,3000/88,75,3000,8000,8,3,48,64,54
hp,3000/iii,175,256,2000,0,3,24,22,20
harris,100,300,768,3000,0,6,24,36,23
harris,300,300,768,3000,6,6,24,44,25
harris,500,300,768,12000,6,6,24,50,52
harris,600,300,768,4500,0,1,24,45,27
harris,700,300,384,12000,6,1,24,53,50
harris,80,300,192,768,6,6,24,36,18
harris,800,180,768,12000,6,1,31,84,53
honeywell,dps:6/35,330,1000,3000,0,2,4,16,23
honeywell,dps:6/92,300,1000,4000,8,3,64,38,30
honeywell,dps:6/96,300,1000,16000,8,2,112,38,73
honeywell,dps:7/35,330,1000,2000,0,1,2,16,20
honeywell,dps:7/45,330,1000,4000,0,3,6,22,25
honeywell,dps:7/55,140,2000,4000,0,3,6,29,28
honeywell,dps:7/65,140,2000,4000,0,4,8,40,29
honeywell,dps:8/44,140,2000,4000,8,1,20,35,32
honeywell,dps:8/49,140,2000,32000,32,1,20,134,175
honeywell,dps:8/50,140,2000,8000,32,1,54,66,57
honeywell,dps:8/52,140,2000,32000,32,1,54,141,181
honeywell,dps:8/62,140,2000,32000,32,1,54,189,181
honeywell,dps:8/20,140,2000,4000,8,1,20,22,32
ibm,3033:s,57,4000,16000,1,6,12,132,82
ibm,3033:u,57,4000,24000,64,12,16,237,171
ibm,3081,26,16000,32000,64,16,24,465,361
ibm,3081:d,26,16000,32000,64,8,24,465,350
ibm,3083:b,26,8000,32000,0,8,24,277,220
ibm,3083:e,26,8000,16000,0,8,16,185,113
ibm,370/125-2,480,96,512,0,1,1,6,15
ibm,370/148,203,1000,2000,0,1,5,24,21
ibm,370/158-3,115,512,6000,16,1,6,45,35
ibm,38/3,1100,512,1500,0,1,1,7,18
ibm,38/4,1100,768,2000,0,1,1,13,20
ibm,38/5,600,768,2000,0,1,1,16,20
ibm,38/7,400,2000,4000,0,1,1,32,28
ibm,38/8,400,4000,8000,0,1,1,32,45
ibm,4321,900,1000,1000,0,1,2,11,18
ibm,4331-1,900,512,1000,0,1,2,11,17
ibm,4331-11,900,1000,4000,4,1,2,18,26
ibm,4331-2,900,1000,4000,8,1,2,22,28
ibm,4341,900,2000,4000,0,3,6,37,28
ibm,4341-1,225,2000,4000,8,3,6,40,31
ibm,4341-10,225,2000,4000,8,3,6,34,31
ibm,4341-11,180,2000,8000,8,1,6,50,42
ibm,4341-12,185,2000,16000,16,1,6,76,76
ibm,4341-2,180,2000,16000,16,1,6,66,76
ibm,4341-9,225,1000,4000,2,3,6,24,26
ibm,4361-4,25,2000,12000,8,1,4,49,59
ibm,4361-5,25,2000,12000,16,3,5,66,65
ibm,4381-1,17,4000,16000,8,6,12,100,101
ibm,4381-2,17,4000,16000,32,6,12,133,116
ibm,8130-a,1500,768,1000,0,0,0,12,18
ibm,8130-b,1500,768,2000,0,0,0,18,20
ibm,8140,800,768,2000,0,0,0,20,20
ipl,4436,50,2000,4000,0,3,6,27,30
ipl,4443,50,2000,8000,8,3,6,45,44
ipl,4445,50,2000,8000,8,1,6,56,44
ipl,4446,50,2000,16000,24,1,6,70,82
ipl,4460,50,2000,16000,24,1,6,80,82
ipl,4480,50,8000,16000,48,1,10,136,128
magnuson,m80/30,100,1000,8000,0,2,6,16,37
magnuson,m80/31,100,1000,8000,24,2,6,26,46
magnuson,m80/32,100,1000,8000,24,3,6,32,46
magnuson,m80/42,50,2000,16000,12,3,16,45,80
magnuson,m80/43,50,2000,16000,24,6,16,54,88
magnuson,m80/44,50,2000,16000,24,6,16,65,88
microdata,seq.ms/3200,150,512,4000,0,8,128,30,33
nas,as/3000,115,2000,8000,16,1,3,50,46
nas,as/3000-n,115,2000,4000,2,1,5,40,29
nas,as/5000,92,2000,8000,32,1,6,62,53
nas,as/5000-e,92,2000,8000,32,1,6,60,53
nas,as/5000-n,92,2000,8000,4,1,6,50,41
nas,as/6130,75,4000,16000,16,1,6,66,86
nas,as/6150,60,4000,16000,32,1,6,86,95
nas,as/6620,60,2000,16000,64,5,8,74,107
nas,as/6630,60,4000,16000,64,5,8,93,117
nas,as/6650,50,4000,16000,64,5,10,111,119
nas,as/7000,72,4000,16000,64,8,16,143,120
nas,as/7000-n,72,2000,8000,16,6,8,105,48
nas,as/8040,40,8000,16000,32,8,16,214,126
nas,as/8050,40,8000,32000,64,8,24,277,266
nas,as/8060,35,8000,32000,64,8,24,370,270
nas,as/9000-dpc,38,16000,32000,128,16,32,510,426
nas,as/9000-n,48,4000,24000,32,8,24,214,151
nas,as/9040,38,8000,32000,64,8,24,326,267
nas,as/9060,30,16000,32000,256,16,24,510,603
ncr,v8535:ii,112,1000,1000,0,1,4,8,19
ncr,v8545:ii,84,1000,2000,0,1,6,12,21
ncr,v8555:ii,56,1000,4000,0,1,6,17,26
ncr,v8565:ii,56,2000,6000,0,1,8,21,35
ncr,v8565:ii-e,56,2000,8000,0,1,8,24,41
ncr,v8575:ii,56,4000,8000,0,1,8,34,47
ncr,v8585:ii,56,4000,12000,0,1,8,42,62
ncr,v8595:ii,56,4000,16000,0,1,8,46,78
ncr,v8635,38,4000,8000,32,16,32,51,80
ncr,v8650,38,4000,8000,32,16,32,116,80
ncr,v8655,38,8000,16000,64,4,8,100,142
ncr,v8665,38,8000,24000,160,4,8,140,281
ncr,v8670,38,4000,16000,128,16,32,212,190
nixdorf,8890/30,200,1000,2000,0,1,2,25,21
nixdorf,8890/50,200,1000,4000,0,1,4,30,25
nixdorf,8890/70,200,2000,8000,64,1,5,41,67
perkin-elmer,3205,250,512,4000,0,1,7,25,24
perkin-elmer,3210,250,512,4000,0,4,7,50,24
perkin-elmer,3230,250,1000,16000,1,1,8,50,64
prime,50-2250,160,512,4000,2,1,5,30,25
prime,50-250-ii,160,512,2000,2,3,8,32,20
prime,50-550-ii,160,1000,4000,8,1,14,38,29
prime,50-750-ii,160,1000,8000,16,1,14,60,43
prime,50-850-ii,160,2000,8000,32,1,13,109,53
siemens,7.521,240,512,1000,8,1,3,6,19
siemens,7.531,240,512,2000,8,1,5,11,22
siemens,7.536,105,2000,4000,8,3,8,22,31
siemens,7.541,105,2000,6000,16,6,16,33,41
siemens,7.551,105,2000,8000,16,4,14,58,47
siemens,7.561,52,4000,16000,32,4,12,130,99
siemens,7.865-2,70,4000,12000,8,6,8,75,67
siemens,7.870-2,59,4000,12000,32,6,12,113,81
siemens,7.872-2,59,8000,16000,64,12,24,188,149
siemens,7.875-2,26,8000,24000,32,8,16,173,183
siemens,7.880-2,26,8000,32000,64,12,16,248,275
siemens,7.881-2,26,8000,32000,128,24,32,405,382
sperry,1100/61-h1,116,2000,8000,32,5,28,70,56
sperry,1100/81,50,2000,32000,24,6,26,114,182
sperry,1100/82,50,2000,32000,48,26,52,208,227
sperry,1100/83,50,2000,32000,112,52,104,307,341
sperry,1100/84,50,4000,32000,112,52,104,397,360
sperry,1100/93,30,8000,64000,96,12,176,915,919
sperry,1100/94,30,8000,64000,128,12,176,1150,978
sperry,80/3,180,262,4000,0,1,3,12,24
sperry,80/4,180,512,4000,0,1,3,14,24
sperry,80/5,180,262,4000,0,1,3,18,24
sperry,80/6,180,512,4000,0,1,3,21,24
sperry,80/8,124,1000,8000,0,1,8,42,37
sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
sratus,32,125,2000,8000,0,2,14,52,41
wang,vs-100,480,512,8000,32,0,0,67,47
wang,vs-90,480,1000,4000,0,0,0,45,25

View File

@@ -0,0 +1,72 @@
1. Title: Relative CPU Performance Data
2. Source Information
-- Creators: Phillip Ein-Dor and Jacob Feldmesser
-- Ein-Dor: Faculty of Management; Tel Aviv University; Ramat-Aviv;
Tel Aviv, 69978; Israel
-- Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779
-- Date: October, 1987
3. Past Usage:
1. Ein-Dor and Feldmesser (CACM 4/87, pp 308-317)
-- Results:
-- linear regression prediction of relative cpu performance
-- Recorded 34% average deviation from actual values
2. Kibler,D. & Aha,D. (1988). Instance-Based Prediction of
Real-Valued Attributes. In Proceedings of the CSCSI (Canadian
AI) Conference.
-- Results:
-- instance-based prediction of relative cpu performance
-- similar results; no transformations required
- Predicted attribute: cpu relative performance (numeric)
4. Relevant Information:
-- The estimated relative performance values were estimated by the authors
using a linear regression method. See their article (pp 308-313) for
more details on how the relative performance values were set.
5. Number of Instances: 209
6. Number of Attributes: 10 (6 predictive attributes, 2 non-predictive,
1 goal field, and the linear regression's guess)
7. Attribute Information:
1. vendor name: 30
(adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec,
dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson,
microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry,
sratus, wang)
2. Model Name: many unique symbols
3. MYCT: machine cycle time in nanoseconds (integer)
4. MMIN: minimum main memory in kilobytes (integer)
5. MMAX: maximum main memory in kilobytes (integer)
6. CACH: cache memory in kilobytes (integer)
7. CHMIN: minimum channels in units (integer)
8. CHMAX: maximum channels in units (integer)
9. PRP: published relative performance (integer)
10. ERP: estimated relative performance from the original article (integer)
8. Missing Attribute Values: None
9. Class Distribution: the class value (PRP) is continuously valued.
PRP Value Range: Number of Instances in Range:
0-20 31
21-100 121
101-200 27
201-300 13
301-400 7
401-500 4
501-600 2
above 600 4
Summary Statistics:
Min Max Mean SD PRP Correlation
MCYT: 17 1500 203.8 260.3 -0.3071
MMIN: 64 32000 2868.0 3878.7 0.7949
MMAX: 64 64000 11796.1 11726.6 0.8630
CACH: 0 256 25.2 40.6 0.6626
CHMIN: 0 52 4.7 6.8 0.6089
CHMAX: 0 176 18.2 26.0 0.6052
PRP: 6 1150 105.6 160.8 1.0000
ERP: 15 1238 99.3 154.8 0.9665

32
demo/regression/mapfeat.py Executable file
View File

@@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
fo = open( 'machine.txt', 'w' )
cnt = 6
fmap = {}
for l in open( 'machine.data' ):
arr = l.split(',')
fo.write(arr[8])
for i in xrange( 0,6 ):
fo.write( ' %d:%s' %(i,arr[i+2]) )
if arr[0] not in fmap:
fmap[arr[0]] = cnt
cnt += 1
fo.write( ' %d:1' % fmap[arr[0]] )
fo.write('\n')
fo.close()
# create feature map for machine data
fo = open('featmap.txt', 'w')
# list from machine.names
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
for i in xrange(0,6):
fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ):
fo.write( '%d\tvendor=%s\ti\n' % (k, v))
fo.close()

29
demo/regression/mknfold.py Executable file
View File

@@ -0,0 +1,29 @@
#!/usr/bin/python
import sys
import random
if len(sys.argv) < 2:
print 'Usage:<filename> <k> [nfold = 5]'
exit(0)
random.seed( 10 )
k = int( sys.argv[2] )
if len(sys.argv) > 3:
nfold = int( sys.argv[3] )
else:
nfold = 5
fi = open( sys.argv[1], 'r' )
ftr = open( sys.argv[1]+'.train', 'w' )
fte = open( sys.argv[1]+'.test', 'w' )
for l in fi:
if random.randint( 1 , nfold ) == k:
fte.write( l )
else:
ftr.write( l )
fi.close()
ftr.close()
fte.close()

16
demo/regression/runexp.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
# map the data to features. For convenience we only use 7 original attributes and encode them as features in a trivial way
python mapfeat.py
# split train and test
python mknfold.py machine.txt 1
# training and output the models
../../xgboost machine.conf
# output predictions of test data
../../xgboost machine.conf task=pred model_in=0002.model
# print the boosters of 0002.model in dump.raw.txt
../../xgboost machine.conf task=dump model_in=0002.model name_dump=dump.raw.txt
# print the boosters of 0002.model in dump.nice.txt with feature map
../../xgboost machine.conf task=dump model_in=0002.model fmap=featmap.txt name_dump=dump.nice.txt
# cat the result
cat dump.nice.txt

403
regression/xgboost_reg.h Normal file
View File

@@ -0,0 +1,403 @@
#ifndef XGBOOST_REG_H
#define XGBOOST_REG_H
/*!
* \file xgboost_reg.h
* \brief class for gradient boosted regression
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "xgboost_reg_data.h"
#include "xgboost_reg_eval.h"
#include "../utils/xgboost_omp.h"
#include "../booster/xgboost_gbmbase.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief class for gradient boosted regression */
class RegBoostLearner{
public:
/*! \brief constructor */
RegBoostLearner( void ){
silent = 0;
}
/*!
* \brief a regression booter associated with training and evaluating data
* \param train pointer to the training data
* \param evals array of evaluating data
* \param evname name of evaluation data, used print statistics
*/
RegBoostLearner( const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname ){
silent = 0;
this->SetData(train,evals,evname);
}
/*!
* \brief associate regression booster with training and evaluating data
* \param train pointer to the training data
* \param evals array of evaluating data
* \param evname name of evaluation data, used print statistics
*/
inline void SetData( const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname ){
this->train_ = train;
this->evals_ = evals;
this->evname_ = evname;
// estimate feature bound
int num_feature = (int)(train->data.NumCol());
// assign buffer index
unsigned buffer_size = static_cast<unsigned>( train->Size() );
for( size_t i = 0; i < evals.size(); ++ i ){
buffer_size += static_cast<unsigned>( evals[i]->Size() );
num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) );
}
char str_temp[25];
if( num_feature > mparam.num_feature ){
mparam.num_feature = num_feature;
sprintf( str_temp, "%d", num_feature );
base_gbm.SetParam( "bst:num_feature", str_temp );
}
sprintf( str_temp, "%u", buffer_size );
base_gbm.SetParam( "num_pbuffer", str_temp );
if( !silent ){
printf( "buffer_size=%u\n", buffer_size );
}
// set eval_preds tmp sapce
this->eval_preds_.resize( evals.size(), std::vector<float>() );
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strcmp( name, "silent") ) silent = atoi( val );
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
mparam.SetParam( name, val );
base_gbm.SetParam( name, val );
}
/*!
* \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation
*/
inline void InitTrainer( void ){
base_gbm.InitTrainer();
if( mparam.loss_type == kLogisticClassify ){
evaluator_.AddEval( "error" );
}else{
evaluator_.AddEval( "rmse" );
}
evaluator_.Init();
}
/*!
* \brief initialize the current data storage for model, if the model is used first time, call this function
*/
inline void InitModel( void ){
base_gbm.InitModel();
mparam.AdjustBase();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel( utils::IStream &fi ){
base_gbm.LoadModel( fi );
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
}
/*!
* \brief DumpModel
* \param fo text file
* \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics as well
*/
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
base_gbm.DumpModel( fo, fmap, with_stats );
}
/*!
* \brief Dump path of all trees
* \param fo text file
* \param data input data
*/
inline void DumpPath( FILE *fo, const DMatrix &data ){
base_gbm.DumpPath( fo, data.data );
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel( utils::IStream &fo ) const{
base_gbm.SaveModel( fo );
fo.Write( &mparam, sizeof(ModelParam) );
}
/*!
* \brief update the model for one iteration
* \param iteration iteration number
*/
inline void UpdateOneIter( int iter ){
this->PredictBuffer( preds_, *train_, 0 );
this->GetGradient( preds_, train_->labels, grad_, hess_ );
std::vector<unsigned> root_index;
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
}
/*!
* \brief evaluate the model for specific iteration
* \param iter iteration number
* \param fo file to output log
*/
inline void EvalOneIter( int iter, FILE *fo = stderr ){
fprintf( fo, "[%d]", iter );
int buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
std::vector<float> &preds = this->eval_preds_[ i ];
this->PredictBuffer( preds, *evals_[i], buffer_offset);
evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
fprintf( fo,"\n" );
}
/*! \brief get prediction, without buffering */
inline void Predict( std::vector<float> &preds, const DMatrix &data ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
}
}
public:
/*!
* \brief update the model for one iteration
* \param iteration iteration number
*/
inline void UpdateInteract( std::string action ){
this->InteractPredict( preds_, *train_, 0 );
int buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
std::vector<float> &preds = this->eval_preds_[ i ];
this->InteractPredict( preds, *evals_[i], buffer_offset );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
if( action == "remove" ){
base_gbm.DelteBooster(); return;
}
this->GetGradient( preds_, train_->labels, grad_, hess_ );
std::vector<unsigned> root_index;
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
this->InteractRePredict( *train_, 0 );
buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
this->InteractRePredict( *evals_[i], buffer_offset );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
}
private:
/*! \brief get the transformed predictions, given data */
inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) );
}
}
/*! \brief repredict trial */
inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
base_gbm.InteractRePredict( data.data, j, buffer_offset + j );
}
}
private:
/*! \brief get the transformed predictions, given data */
inline void PredictBuffer( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
}
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
inline void GetGradient( const std::vector<float> &preds,
const std::vector<float> &labels,
std::vector<float> &grad,
std::vector<float> &hess ){
grad.resize( preds.size() ); hess.resize( preds.size() );
const unsigned ndata = static_cast<unsigned>( preds.size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] );
hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] );
}
}
private:
enum LossType{
kLinearSquare = 0,
kLogisticNeglik = 1,
kLogisticClassify = 2
};
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
float base_score;
/* \brief type of loss function */
int loss_type;
/* \brief number of features */
int num_feature;
/*! \brief reserved field */
int reserved[ 16 ];
/*! \brief constructor */
ModelParam( void ){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
memset( reserved, 0, sizeof( reserved ) );
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam( const char *name, const char *val ){
if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
}
/*!
* \brief adjust base_score
*/
inline void AdjustBase( void ){
if( loss_type == 1 || loss_type == 2 ){
utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
base_score = - logf( 1.0f / base_score - 1.0f );
}
}
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform( float x ){
switch( loss_type ){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f/(1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient( float predt, float label ) const{
switch( loss_type ){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient( float predt, float label ) const{
switch( loss_type ){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * ( 1 - predt );
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the loss, given the predictions, labels and the loss type
* \param preds the given predictions
* \param labels the given labels
* \return the specified loss
*/
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
switch( loss_type ){
case kLinearSquare: return SquareLoss(preds,labels);
case kLogisticNeglik:
case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculating the square loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of square loss
*/
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for(size_t i = 0; i < preds.size(); i++){
float dif = preds[i] - labels[i];
ans += dif * dif;
}
return ans;
}
/*!
* \brief calculating the square loss, given the predictions and labels
* \param preds the given predictions
* \param labels the given labels
* \return the summation of square loss
*/
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for(size_t i = 0; i < preds.size(); i++)
ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]);
return ans;
}
};
private:
int silent;
EvalSet evaluator_;
booster::GBMBase base_gbm;
ModelParam mparam;
const DMatrix *train_;
std::vector<DMatrix *> evals_;
std::vector<std::string> evname_;
std::vector<unsigned> buffer_index_;
private:
std::vector<float> grad_, hess_, preds_;
std::vector< std::vector<float> > eval_preds_;
};
}
};
#endif

View File

@@ -0,0 +1,155 @@
#ifndef XGBOOST_REG_DATA_H
#define XGBOOST_REG_DATA_H
/*!
* \file xgboost_reg_data.h
* \brief input data structure for regression and binary classification task.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label <nonzero feature dimension> [feature index:feature value]+
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
/*! \brief maximum feature dimension */
unsigned num_feature;
/*! \brief feature data content */
booster::FMatrixS data;
/*! \brief label of each instance */
std::vector<float> labels;
public:
/*! \brief default constructor */
DMatrix( void ){}
/*! \brief get the number of instances */
inline size_t Size() const{
return labels.size();
}
/*!
* \brief load from text file
* \param fname name of text data
* \param silent whether print information or not
*/
inline void LoadText( const char* fname, bool silent = false ){
data.Clear();
FILE* file = utils::FopenCheck( fname, "r" );
float label; bool init = true;
char tmp[ 1024 ];
std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue;
while( fscanf( file, "%s", tmp ) == 1 ){
unsigned index; float value;
if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
findex.push_back( index ); fvalue.push_back( value );
}else{
if( !init ){
labels.push_back( label );
data.AddRow( findex, fvalue );
}
findex.clear(); fvalue.clear();
utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" );
init = false;
}
}
labels.push_back( label );
data.AddRow( findex, fvalue );
// initialize column support as well
data.InitData();
if( !silent ){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
}
fclose(file);
}
/*!
* \brief load from binary file
* \param fname name of binary data
* \param silent whether print information or not
* \return whether loading is success
*/
inline bool LoadBinary( const char* fname, bool silent = false ){
FILE *fp = fopen64( fname, "rb" );
if( fp == NULL ) return false;
utils::FileStream fs( fp );
data.LoadBinary( fs );
labels.resize( data.NumRow() );
utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
fs.Close();
// initialize column support as well
data.InitData();
if( !silent ){
printf("%ux%u matrix with %lu entries is loaded from %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
}
return true;
}
/*!
* \brief save to binary file
* \param fname name of binary data
* \param silent whether print information or not
*/
inline void SaveBinary( const char* fname, bool silent = false ){
// initialize column support as well
data.InitData();
utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
data.SaveBinary( fs );
fs.Write( &labels[0], sizeof(float) * data.NumRow() );
fs.Close();
if( !silent ){
printf("%ux%u matrix with %lu entries is saved to %s\n",
(unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
}
}
/*!
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
* otherwise the function will first check if fname + '.buffer' exists,
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
* and try to create a buffer file
* \param fname name of binary data
* \param silent whether print information or not
* \param savebuffer whether do save binary buffer if it is text
*/
inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){
int len = strlen( fname );
if( len > 8 && !strcmp( fname + len - 7, ".buffer") ){
this->LoadBinary( fname, silent ); return;
}
char bname[ 1024 ];
sprintf( bname, "%s.buffer", fname );
if( !this->LoadBinary( bname, silent ) ){
this->LoadText( fname, silent );
if( savebuffer ) this->SaveBinary( bname, silent );
}
}
private:
/*! \brief update num_feature info */
inline void UpdateInfo( void ){
this->num_feature = 0;
for( size_t i = 0; i < data.NumRow(); i ++ ){
booster::FMatrixS::Line sp = data[i];
for( unsigned j = 0; j < sp.len; j ++ ){
if( num_feature <= sp[j].findex ){
num_feature = sp[j].findex + 1;
}
}
}
}
};
};
};
#endif

View File

@@ -0,0 +1,119 @@
#ifndef XGBOOST_REG_EVAL_H
#define XGBOOST_REG_EVAL_H
/*!
* \file xgboost_reg_eval.h
* \brief evaluation metrics for regression and classification
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <vector>
#include <algorithm>
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_omp.h"
namespace xgboost{
namespace regression{
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric
* \param preds prediction
* \param labels label
*/
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const= 0;
/*! \return name of metric */
virtual const char *Name( void ) const= 0;
};
/*! \brief RMSE */
struct EvalRMSE : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
float sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
float diff = preds[i] - labels[i];
sum += diff * diff;
}
return sqrtf( sum / ndata );
}
virtual const char *Name( void ) const{
return "rmse";
}
};
/*! \brief Error */
struct EvalError : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
if( preds[i] > 0.5f ){
if( labels[i] < 0.5f ) nerr += 1;
}else{
if( labels[i] > 0.5f ) nerr += 1;
}
}
return static_cast<float>(nerr) / ndata;
}
virtual const char *Name( void ) const{
return "error";
}
};
/*! \brief Error */
struct EvalLogLoss : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
unsigned nerr = 0;
#pragma omp parallel for reduction(+:nerr) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const float y = labels[i];
const float py = preds[i];
nerr -= y * std::log(py) + (1.0f-y)*std::log(1-py);
}
return static_cast<float>(nerr) / ndata;
}
virtual const char *Name( void ) const{
return "negllik";
}
};
};
namespace regression{
/*! \brief a set of evaluators */
struct EvalSet{
public:
inline void AddEval( const char *name ){
if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
if( !strcmp( name, "error") ) evals_.push_back( &error_ );
if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
}
inline void Init( void ){
std::sort( evals_.begin(), evals_.end() );
evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() );
}
inline void Eval( FILE *fo, const char *evname,
const std::vector<float> &preds,
const std::vector<float> &labels ) const{
for( size_t i = 0; i < evals_.size(); ++ i ){
float res = evals_[i]->Eval( preds, labels );
fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
}
}
private:
EvalRMSE rmse_;
EvalError error_;
EvalLogLoss logloss_;
std::vector<const IEvaluator*> evals_;
};
};
};
#endif

View File

@@ -0,0 +1,280 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <ctime>
#include <string>
#include <cstring>
#include "xgboost_reg.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_random.h"
#include "../utils/xgboost_config.h"
namespace xgboost{
namespace regression{
/*!
* \brief wrapping the training process of the gradient boosting regression model,
* given the configuation
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
*/
class RegBoostTask{
public:
inline int Run( int argc, char *argv[] ){
if( argc < 2 ){
printf("Usage: <config>\n");
return 0;
}
utils::ConfigIterator itr( argv[1] );
while( itr.Next() ){
this->SetParam( itr.name(), itr.val() );
}
for( int i = 2; i < argc; i ++ ){
char name[256], val[256];
if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){
this->SetParam( name, val );
}
}
this->InitData();
this->InitLearner();
if( task == "dump" ){
this->TaskDump();
return 0;
}
if( task == "interact" ){
this->TaskInteractive(); return 0;
}
if( task == "dumppath" ){
this->TaskDumpPath(); return 0;
}
if( task == "eval" ){
this->TaskEval(); return 0;
}
if( task == "pred" ){
this->TaskPred();
}else{
this->TaskTrain();
}
return 0;
}
inline void SetParam( const char *name, const char *val ){
if( !strcmp("silent", name ) ) silent = atoi( val );
if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val );
if( !strcmp("seed", name ) ) random::Seed( atoi(val) );
if( !strcmp("num_round", name ) ) num_round = atoi( val );
if( !strcmp("save_period", name ) ) save_period = atoi( val );
if( !strcmp("task", name ) ) task = val;
if( !strcmp("data", name ) ) train_path = val;
if( !strcmp("test:data", name ) ) test_path = val;
if( !strcmp("model_in", name ) ) model_in = val;
if( !strcmp("model_out", name ) ) model_out = val;
if( !strcmp("model_dir", name ) ) model_dir_path = val;
if( !strcmp("fmap", name ) ) name_fmap = val;
if( !strcmp("name_dump", name ) ) name_dump = val;
if( !strcmp("name_dumppath", name ) ) name_dumppath = val;
if( !strcmp("name_pred", name ) ) name_pred = val;
if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val );
if( !strcmp("interact:action", name ) ) interact_action = val;
if( !strncmp("batch:", name, 6 ) ){
cfg_batch.PushBack( name + 6, val );
}
if( !strncmp("eval[", name, 5 ) ) {
char evname[ 256 ];
utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
eval_data_names.push_back( std::string( evname ) );
eval_data_paths.push_back( std::string( val ) );
}
cfg.PushBack( name, val );
}
public:
RegBoostTask( void ){
// default parameters
silent = 0;
use_buffer = 1;
num_round = 10;
save_period = 0;
dump_model_stats = 0;
task = "train";
model_in = "NULL";
model_out = "NULL";
name_fmap = "NULL";
name_pred = "pred.txt";
name_dump = "dump.txt";
name_dumppath = "dump.path.txt";
model_dir_path = "./";
interact_action = "update";
}
~RegBoostTask( void ){
for( size_t i = 0; i < deval.size(); i ++ ){
delete deval[i];
}
}
private:
inline void InitData( void ){
if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
if( task == "dump" ) return;
if( task == "pred" || task == "dumppath" ){
data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
}else{
// training
data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 );
utils::Assert( eval_data_names.size() == eval_data_paths.size() );
for( size_t i = 0; i < eval_data_names.size(); ++ i ){
deval.push_back( new DMatrix() );
deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 );
}
}
learner.SetData( &data, deval, eval_data_names );
}
inline void InitLearner( void ){
cfg.BeforeFirst();
while( cfg.Next() ){
learner.SetParam( cfg.name(), cfg.val() );
}
if( model_in != "NULL" ){
utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") );
learner.LoadModel( fi );
fi.Close();
}else{
utils::Assert( task == "train", "model_in not specified" );
learner.InitModel();
}
learner.InitTrainer();
}
inline void TaskTrain( void ){
const time_t start = time( NULL );
unsigned long elapsed = 0;
for( int i = 0; i < num_round; ++ i ){
elapsed = (unsigned long)(time(NULL) - start);
if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed );
learner.UpdateOneIter( i );
learner.EvalOneIter( i );
if( save_period != 0 && (i+1) % save_period == 0 ){
this->SaveModel( i );
}
elapsed = (unsigned long)(time(NULL) - start);
}
// always save final round
if( save_period == 0 || num_round % save_period != 0 ){
if( model_out == "NULL" ){
this->SaveModel( num_round - 1 );
}else{
this->SaveModel( model_out.c_str() );
}
}
if( !silent ){
printf("\nupdating end, %lu sec in all\n", elapsed );
}
}
inline void TaskEval( void ){
learner.EvalOneIter( 0 );
}
inline void TaskInteractive( void ){
const time_t start = time( NULL );
unsigned long elapsed = 0;
int batch_action = 0;
cfg_batch.BeforeFirst();
while( cfg_batch.Next() ){
if( !strcmp( cfg_batch.name(), "run" ) ){
learner.UpdateInteract( interact_action );
batch_action += 1;
} else{
learner.SetParam( cfg_batch.name(), cfg_batch.val() );
}
}
if( batch_action == 0 ){
learner.UpdateInteract( interact_action );
}
utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
this->SaveModel( model_out.c_str() );
elapsed = (unsigned long)(time(NULL) - start);
if( !silent ){
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed );
}
}
inline void TaskDump( void ){
FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
learner.DumpModel( fo, fmap, dump_model_stats != 0 );
fclose( fo );
}
inline void TaskDumpPath( void ){
FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" );
learner.DumpPath( fo, data );
fclose( fo );
}
inline void SaveModel( const char *fname ) const{
utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
learner.SaveModel( fo );
fo.Close();
}
inline void SaveModel( int i ) const{
char fname[256];
sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
this->SaveModel( fname );
}
inline void TaskPred( void ){
std::vector<float> preds;
if( !silent ) printf("start prediction...\n");
learner.Predict( preds, data );
if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() );
FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" );
for( size_t i = 0; i < preds.size(); i ++ ){
fprintf( fo, "%f\n", preds[i] );
}
fclose( fo );
}
private:
/* \brief whether silent */
int silent;
/* \brief whether use auto binary buffer */
int use_buffer;
/* \brief number of boosting iterations */
int num_round;
/* \brief the period to save the model, 0 means only save the final round model */
int save_period;
/*! \brief interfact action */
std::string interact_action;
/* \brief the path of training/test data set */
std::string train_path, test_path;
/* \brief the path of test model file, or file to restart training */
std::string model_in;
/* \brief the path of final model file, to be saved */
std::string model_out;
/* \brief the path of directory containing the saved models */
std::string model_dir_path;
/* \brief task to perform */
std::string task;
/* \brief name of predict file */
std::string name_pred;
/* \brief whether dump statistics along with model */
int dump_model_stats;
/* \brief name of feature map */
std::string name_fmap;
/* \brief name of dump file */
std::string name_dump;
/* \brief name of dump path file */
std::string name_dumppath;
/* \brief the paths of validation data sets */
std::vector<std::string> eval_data_paths;
/* \brief the names of the evaluation data used in output log */
std::vector<std::string> eval_data_names;
/*! \brief saves configurations */
utils::ConfigSaver cfg;
/*! \brief batch configurations */
utils::ConfigSaver cfg_batch;
private:
DMatrix data;
std::vector<DMatrix*> deval;
utils::FeatMap fmap;
RegBoostLearner learner;
};
};
};
int main( int argc, char *argv[] ){
xgboost::random::Seed( 0 );
xgboost::regression::RegBoostTask tsk;
return tsk.Run( argc, argv );
}

214
utils/xgboost_config.h Normal file
View File

@@ -0,0 +1,214 @@
#ifndef XGBOOST_CONFIG_H
#define XGBOOST_CONFIG_H
/*!
* \file xgboost_config.h
* \brief helper class to load in configures from file
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstring>
#include <string>
#include "xgboost_utils.h"
#include <vector>
namespace xgboost{
namespace utils{
/*!
* \brief an iterator that iterates over a configure file and gets the configures
*/
class ConfigIterator{
public:
/*!
* \brief constructor
* \param fname name of configure file
*/
ConfigIterator( const char *fname ){
fi = FopenCheck( fname, "r");
ch_buf = fgetc( fi );
}
/*! \brief destructor */
~ConfigIterator(){
fclose( fi );
}
/*!
* \brief get current name, called after Next returns true
* \return current parameter name
*/
inline const char *name( void )const{
return s_name;
}
/*!
* \brief get current value, called after Next returns true
* \return current parameter value
*/
inline const char *val( void ) const{
return s_val;
}
/*!
* \brief move iterator to next position
* \return true if there is value in next position
*/
inline bool Next( void ){
while( !feof( fi ) ){
GetNextToken( s_name );
if( s_name[0] == '=') return false;
if( GetNextToken( s_buf ) || s_buf[0] != '=' ) return false;
if( GetNextToken( s_val ) || s_val[0] == '=' ) return false;
return true;
}
return false;
}
private:
FILE *fi;
char ch_buf;
char s_name[256],s_val[256],s_buf[246];
inline void SkipLine(){
do{
ch_buf = fgetc( fi );
}while( ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r' );
}
inline void ParseStr( char tok[] ){
int i = 0;
while( (ch_buf = fgetc(fi)) != EOF ){
switch( ch_buf ){
case '\\': tok[i++] = fgetc( fi ); break;
case '\"': tok[i++] = '\0';
return;
case '\r':
case '\n': Error("unterminated string"); break;
default: tok[i++] = ch_buf;
}
}
Error("unterminated string");
}
// return newline
inline bool GetNextToken( char tok[] ){
int i = 0;
bool new_line = false;
while( ch_buf != EOF ){
switch( ch_buf ){
case '#' : SkipLine(); new_line = true; break;
case '\"':
if( i == 0 ){
ParseStr( tok );ch_buf = fgetc(fi); return new_line;
}else{
Error("token followed directly by string");
}
case '=':
if( i == 0 ) {
ch_buf = fgetc( fi );
tok[0] = '=';
tok[1] = '\0';
}else{
tok[i] = '\0';
}
return new_line;
case '\r':
case '\n':
if( i == 0 ) new_line = true;
case '\t':
case ' ' :
ch_buf = fgetc( fi );
if( i > 0 ){
tok[i] = '\0';
return new_line;
}
break;
default:
tok[i++] = ch_buf;
ch_buf = fgetc( fi );
break;
}
}
return true;
}
};
};
namespace utils{
/*!
* \brief a class that save parameter configurations
* temporally and allows to get them out later
* there are two kinds of priority in ConfigSaver
*/
class ConfigSaver{
public:
/*! \brief constructor */
ConfigSaver( void ){ idx = 0; }
/*! \brief clear all saves */
inline void Clear( void ){
idx = 0;
names.clear(); values.clear();
names_high.clear(); values_high.clear();
}
/*!
* \brief push back a parameter setting
* \param name name of parameter
* \param val value of parameter
* \param priority whether the setting has higher priority: high priority occurs
* latter when read from ConfigSaver, and can overwrite existing settings
*/
inline void PushBack( const char *name, const char *val, int priority = 0 ){
if( priority == 0 ){
names.push_back( std::string( name ) );
values.push_back( std::string( val ) );
}else{
names_high.push_back( std::string( name ) );
values_high.push_back( std::string( val ) );
}
}
/*! \brief set pointer to beginning of the ConfigSaver */
inline void BeforeFirst( void ){
idx = 0;
}
/*!
* \brief move iterator to next position
* \return true if there is value in next position
*/
inline bool Next( void ){
if( idx >= names.size() + names_high.size() ){
return false;
}
idx ++;
return true;
}
/*!
* \brief get current name, called after Next returns true
* \return current parameter name
*/
inline const char *name( void ) const{
Assert( idx > 0, "can't call name before first");
size_t i = idx - 1;
if( i >= names.size() ){
return names_high[ i - names.size() ].c_str();
}else{
return names[ i ].c_str();
}
}
/*!
* \brief get current value, called after Next returns true
* \return current parameter value
*/
inline const char *val( void ) const{
Assert( idx > 0, "can't call name before first");
size_t i = idx - 1;
if( i >= values.size() ){
return values_high[ i - values.size() ].c_str();
}else{
return values[ i ].c_str();
}
}
private:
std::vector<std::string> names;
std::vector<std::string> values;
std::vector<std::string> names_high;
std::vector<std::string> values_high;
size_t idx;
};
};
};
#endif

123
utils/xgboost_fmap.h Normal file
View File

@@ -0,0 +1,123 @@
#ifndef XGBOOST_FMAP_H
#define XGBOOST_FMAP_H
/*!
* \file xgboost_fmap.h
* \brief helper class that holds the feature names and interpretations
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include <string>
#include <cstring>
#include "xgboost_utils.h"
namespace xgboost{
namespace utils{
/*! \brief helper class that holds the feature names and interpretations */
class FeatMap{
public:
enum Type{
kIndicator = 0,
kQuantitive = 1,
kInteger = 2,
kFloat = 3
};
public:
/*! \brief load feature map from text format */
inline void LoadText( const char *fname ){
FILE *fi = utils::FopenCheck( fname, "r" );
this->LoadText( fi );
fclose( fi );
}
/*! \brief load feature map from text format */
inline void LoadText( FILE *fi ){
int fid;
char fname[256], ftype[256];
while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){
utils::Assert( fid == (int)names_.size(), "invalid fmap format" );
names_.push_back( std::string(fname) );
types_.push_back( GetType( ftype ) );
}
}
/*! \brief number of known features */
size_t size( void ) const{
return names_.size();
}
/*! \brief return name of specific feature */
const char* name( size_t idx ) const{
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
return names_[ idx ].c_str();
}
/*! \brief return type of specific feature */
const Type& type( size_t idx ) const{
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" );
return types_[ idx ];
}
private:
inline static Type GetType( const char *tname ){
if( !strcmp( "i", tname ) ) return kIndicator;
if( !strcmp( "q", tname ) ) return kQuantitive;
if( !strcmp( "int", tname ) ) return kInteger;
if( !strcmp( "float", tname ) ) return kFloat;
utils::Error("unknown feature type, use i for indicator and q for quantity");
return kIndicator;
}
private:
/*! \brief name of the feature */
std::vector<std::string> names_;
/*! \brief type of the feature */
std::vector<Type> types_;
};
}; // namespace utils
namespace utils{
/*! \brief feature constraint, allow or disallow some feature during training */
class FeatConstrain{
public:
FeatConstrain( void ){
default_state_ = +1;
}
/*!\brief set parameters */
inline void SetParam( const char *name, const char *val ){
int a, b;
if( !strcmp( name, "fban") ){
this->ParseRange( val, a, b );
this->SetRange( a, b, -1 );
}
if( !strcmp( name, "fpass") ){
this->ParseRange( val, a, b );
this->SetRange( a, b, +1 );
}
if( !strcmp( name, "fdefault") ){
default_state_ = atoi( val );
}
}
/*! \brief whether constrain is specified */
inline bool HasConstrain( void ) const {
return state_.size() != 0 && default_state_ == 1;
}
/*! \brief whether a feature index is banned or not */
inline bool NotBanned( unsigned index ) const{
int rt = index < state_.size() ? state_[index] : default_state_;
if( rt == 0 ) rt = default_state_;
return rt == 1;
}
private:
inline void SetRange( int a, int b, int st ){
if( b > (int)state_.size() ) state_.resize( b, 0 );
for( int i = a; i < b; ++ i ){
state_[i] = st;
}
}
inline void ParseRange( const char *val, int &a, int &b ){
if( sscanf( val, "%d-%d", &a, &b ) == 2 ) return;
utils::Assert( sscanf( val, "%d", &a ) == 1 );
b = a + 1;
}
/*! \brief default state */
int default_state_;
/*! \brief whether the state here is, +1:pass, -1: ban, 0:default */
std::vector<int> state_;
};
}; // namespace utils
}; // namespace xgboost
#endif // XGBOOST_FMAP_H

155
utils/xgboost_matrix_csr.h Normal file
View File

@@ -0,0 +1,155 @@
/*!
* \file xgboost_matrix_csr.h
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#ifndef XGBOOST_MATRIX_CSR_H
#define XGBOOST_MATRIX_CSR_H
#include <vector>
#include <algorithm>
#include "xgboost_utils.h"
namespace xgboost{
namespace utils{
/*!
* \brief a class used to help construct CSR format matrix,
* can be used to convert row major CSR to column major CSR
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
* \tparam whether enabling the usage of aclist, this option must be enabled manually
*/
template<typename IndexType,bool UseAcList = false>
struct SparseCSRMBuilder{
private:
/*! \brief dummy variable used in the indicator matrix construction */
std::vector<size_t> dummy_aclist;
/*! \brief pointer to each of the row */
std::vector<size_t> &rptr;
/*! \brief index of nonzero entries in each row */
std::vector<IndexType> &findex;
/*! \brief a list of active rows, used when many rows are empty */
std::vector<size_t> &aclist;
public:
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex )
:rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){
Assert( !UseAcList, "enabling bug" );
}
/*! \brief use with caution! rptr must be cleaned before use */
SparseCSRMBuilder( std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex,
std::vector<size_t> &p_aclist )
:rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){
Assert( UseAcList, "must manually enable the option use aclist" );
}
public:
/*!
* \brief step 1: initialize the number of rows in the data, not necessary exact
* \nrows number of rows in the matrix, can be smaller than expected
*/
inline void InitBudget( size_t nrows = 0 ){
if( !UseAcList ){
rptr.clear();
rptr.resize( nrows + 1, 0 );
}else{
Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" );
this->Cleanup();
}
}
/*!
* \brief step 2: add budget to each rows, this function is called when aclist is used
* \param row_id the id of the row
* \param nelem number of element budget add to this row
*/
inline void AddBudget( size_t row_id, size_t nelem = 1 ){
if( rptr.size() < row_id + 2 ){
rptr.resize( row_id + 2, 0 );
}
if( UseAcList ){
if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id );
}
rptr[ row_id + 1 ] += nelem;
}
/*! \brief step 3: initialize the necessary storage */
inline void InitStorage( void ){
// initialize rptr to be beginning of each segment
size_t start = 0;
if( !UseAcList ){
for( size_t i = 1; i < rptr.size(); i ++ ){
size_t rlen = rptr[ i ];
rptr[ i ] = start;
start += rlen;
}
}else{
// case with active list
std::sort( aclist.begin(), aclist.end() );
for( size_t i = 0; i < aclist.size(); i ++ ){
size_t ridx = aclist[ i ];
size_t rlen = rptr[ ridx + 1 ];
rptr[ ridx + 1 ] = start;
// set previous rptr to right position if previous feature is not active
if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start;
start += rlen;
}
}
findex.resize( start );
}
/*!
* \brief step 4:
* used in indicator matrix construction, add new
* element to each row, the number of calls shall be exactly same as add_budget
*/
inline void PushElem( size_t row_id, IndexType col_id ){
size_t &rp = rptr[ row_id + 1 ];
findex[ rp ++ ] = col_id;
}
/*!
* \brief step 5: only needed when aclist is used
* clean up the rptr for next usage
*/
inline void Cleanup( void ){
Assert( UseAcList, "this function can only be called use AcList" );
for( size_t i = 0; i < aclist.size(); i ++ ){
const size_t ridx = aclist[i];
rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0;
}
aclist.clear();
}
};
};
namespace utils{
/*!
* \brief simple sparse matrix container
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
*/
template<typename IndexType>
struct SparseCSRMat{
private:
/*! \brief pointer to each of the row */
std::vector<size_t> rptr;
/*! \brief index of nonzero entries in each row */
std::vector<IndexType> findex;
public:
/*! \brief matrix builder*/
SparseCSRMBuilder<IndexType> builder;
public:
SparseCSRMat( void ):builder( rptr, findex ){
}
public:
/*! \return number of rows in the matrx */
inline size_t NumRow( void ) const{
return rptr.size() - 1;
}
/*! \return number of elements r-th row */
inline size_t NumElem( size_t r ) const{
return rptr[ r + 1 ] - rptr[ r ];
}
/*! \return r-th row */
inline const IndexType *operator[]( size_t r ) const{
return &findex[ rptr[r] ];
}
};
};
};
#endif

18
utils/xgboost_omp.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef XGBOOST_OMP_H
#define XGBOOST_OMP_H
/*!
* \file xgboost_omp.h
* \brief header to handle OpenMP compatibility issues
*
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#if defined(_OPENMP)
#include <omp.h>
#else
#warning "OpenMP is not available, compile to single thread code"
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
inline void omp_set_num_threads( int nthread ) {}
#endif
#endif

131
utils/xgboost_random.h Normal file
View File

@@ -0,0 +1,131 @@
#ifndef XGBOOST_RANDOM_H
#define XGBOOST_RANDOM_H
/*!
* \file xgboost_random.h
* \brief PRNG to support random number generation
* \author Tianqi Chen: tianqi.tchen@gmail.com
*
* Use standard PRNG from stdlib
*/
#include <cmath>
#include <cstdlib>
#include <vector>
#ifdef _MSC_VER
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
#else
#include <inttypes.h>
#endif
/*! namespace of PRNG */
namespace xgboost{
namespace random{
/*! \brief seed the PRNG */
inline void Seed( uint32_t seed ){
srand( seed );
}
/*! \brief return a real number uniform in [0,1) */
inline double NextDouble(){
return static_cast<double>( rand() ) / (static_cast<double>( RAND_MAX )+1.0);
}
/*! \brief return a real numer uniform in (0,1) */
inline double NextDouble2(){
return (static_cast<double>( rand() ) + 1.0 ) / (static_cast<double>(RAND_MAX) + 2.0);
}
};
namespace random{
/*! \brief return a random number */
inline uint32_t NextUInt32( void ){
return (uint32_t)rand();
}
/*! \brief return a random number in n */
inline uint32_t NextUInt32( uint32_t n ){
return (uint32_t) floor( NextDouble() * n ) ;
}
/*! \brief return x~N(0,1) */
inline double SampleNormal(){
double x,y,s;
do{
x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0;
s = x*x + y*y;
}while( s >= 1.0 || s == 0.0 );
return x * sqrt( -2.0 * log(s) / s ) ;
}
/*! \brief return iid x,y ~N(0,1) */
inline void SampleNormal2D( double &xx, double &yy ){
double x,y,s;
do{
x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0;
s = x*x + y*y;
}while( s >= 1.0 || s == 0.0 );
double t = sqrt( -2.0 * log(s) / s ) ;
xx = x * t;
yy = y * t;
}
/*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal( double mu, double sigma ){
return SampleNormal() * sigma + mu;
}
/*! \brief return 1 with probability p, coin flip */
inline int SampleBinary( double p ){
return NextDouble() < p;
}
/*! \brief return distribution from Gamma( alpha, beta ) */
inline double SampleGamma( double alpha, double beta ) {
if ( alpha < 1.0 ) {
double u;
do {
u = NextDouble();
} while (u == 0.0);
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
} else {
double d,c,x,v,u;
d = alpha - 1.0/3.0;
c = 1.0 / sqrt( 9.0 * d );
do {
do {
x = SampleNormal();
v = 1.0 + c*x;
} while ( v <= 0.0 );
v = v * v * v;
u = NextDouble();
} while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x)))
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) );
return d * v / beta;
}
}
template<typename T>
inline void Exchange( T &a, T &b ){
T c;
c = a;
a = b;
b = c;
}
template<typename T>
inline void Shuffle( T *data, size_t sz ){
if( sz == 0 ) return;
for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){
Exchange( data[i], data[ NextUInt32( i+1 ) ] );
}
}
// random shuffle the data inside, require PRNG
template<typename T>
inline void Shuffle( std::vector<T> &data ){
Shuffle( &data[0], data.size() );
}
};
};
#endif

54
utils/xgboost_stream.h Normal file
View File

@@ -0,0 +1,54 @@
#ifndef XGBOOST_STREAM_H
#define XGBOOST_STREAM_H
#include <cstdio>
/*!
* \file xgboost_stream.h
* \brief general stream interface for serialization
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
namespace xgboost{
namespace utils{
/*!
* \brief interface of stream I/O, used to serialize model
*/
class IStream{
public:
/*!
* \brief read data from stream
* \param ptr pointer to memory buffer
* \param size size of block
* \return usually is the size of data readed
*/
virtual size_t Read( void *ptr, size_t size ) = 0;
/*!
* \brief write data to stream
* \param ptr pointer to memory buffer
* \param size size of block
*/
virtual void Write( const void *ptr, size_t size ) = 0;
/*! \brief virtual destructor */
virtual ~IStream( void ){}
};
/*! \brief implementation of file i/o stream */
class FileStream: public IStream{
private:
FILE *fp;
public:
FileStream( FILE *fp ){
this->fp = fp;
}
virtual size_t Read( void *ptr, size_t size ){
return fread( ptr, size, 1, fp );
}
virtual void Write( const void *ptr, size_t size ){
fwrite( ptr, size, 1, fp );
}
inline void Close( void ){
fclose( fp );
}
};
};
};
#endif

68
utils/xgboost_utils.h Normal file
View File

@@ -0,0 +1,68 @@
#ifndef XGBOOST_UTILS_H
#define XGBOOST_UTILS_H
/*!
* \file xgboost_utils.h
* \brief simple utils to support the code
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#define _CRT_SECURE_NO_WARNINGS
#ifdef _MSC_VER
#define fopen64 fopen
#else
// use 64 bit offset, either to include this header in the beginning, or
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit"
#endif
#endif
#ifdef __APPLE__
#define off64_t off_t
#define fopen64 fopen
#endif
#define _FILE_OFFSET_BITS 64
extern "C"{
#include <sys/types.h>
};
#include <cstdio>
#endif
#include <cstdio>
#include <cstdlib>
namespace xgboost{
/*! \brief namespace for helper utils of the project */
namespace utils{
inline void Error( const char *msg ){
fprintf( stderr, "Error:%s\n",msg );
exit( -1 );
}
inline void Assert( bool exp ){
if( !exp ) Error( "AssertError" );
}
inline void Assert( bool exp, const char *msg ){
if( !exp ) Error( msg );
}
inline void Warning( const char *msg ){
fprintf( stderr, "warning:%s\n",msg );
}
/*! \brief replace fopen, report error when the file open fails */
inline FILE *FopenCheck( const char *fname , const char *flag ){
FILE *fp = fopen64( fname , flag );
if( fp == NULL ){
fprintf( stderr, "can not open file \"%s\"\n",fname );
exit( -1 );
}
return fp;
}
};
};
#endif