Compare commits

...

166 Commits
v0.1 ... v0.22

Author SHA1 Message Date
tqchen@graphlab.com
56b1a3301f Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-08-15 13:36:56 -07:00
tqchen@graphlab.com
920f9f3565 save name_obj from now 2014-08-15 13:36:19 -07:00
Tianqi Chen
c1a868e7ff Update README.md 2014-08-12 14:57:28 -07:00
Tianqi Chen
63c4025656 Update README.md 2014-08-12 14:57:05 -07:00
Tianqi Chen
4a622da67b Update README.md 2014-08-12 14:56:51 -07:00
Tianqi Chen
b10efa2e4b Update README.md 2014-08-12 14:56:12 -07:00
tqchen
0d6b977395 support for multiclass output prob 2014-08-01 11:21:17 -07:00
Tianqi Chen
ca4b3b7541 Update xgboost_regrank.h 2014-07-12 10:14:30 -07:00
Tianqi Chen
4a98205ef1 Merge pull request #16 from smly/minor-leak
fix (trivial) leak in xgboost_regrank, Thanks for the fix
2014-07-12 09:58:07 -07:00
Kohei Ozaki
982d16b2b6 fix (trivial) leak in xgboost_regrank 2014-07-12 17:29:49 +09:00
tqchen
fde318716f fix combine buffer 2014-05-25 16:46:03 -07:00
tqchen
094d0a4497 add rand seeds back 2014-05-25 10:18:04 -07:00
tqchen
d8b0edf133 ok 2014-05-25 10:15:57 -07:00
Tianqi Chen
bf5fcec8e8 change rank order output to follow kaggle convention 2014-05-25 10:08:38 -07:00
tqchen
278b788b34 make python random seed invariant in each round 2014-05-24 20:57:39 -07:00
tqchen
76c44072d1 fix sometimes python cachelist problem 2014-05-20 15:42:19 -07:00
tqchen
ccde443590 more clean demo 2014-05-20 08:33:35 -07:00
tqchen
cf710bfa59 fix bug in classification, scale_pos_weight initialization 2014-05-20 08:30:19 -07:00
tqchen
be2c3d299e chg 2014-05-19 10:02:01 -07:00
Tianqi Chen
2eba59000a Merge pull request #7 from jrings/master
Compatibility with both Python 2(.7) and 3
2014-05-19 09:48:34 -07:00
Joerg Rings
a958fe8d52 Compatibility with both Python 2(.7) and 3 2014-05-19 11:23:53 -05:00
Tianqi Chen
96667b8bad Merge pull request #6 from tqchen/dev
Fix the bug in MAC
2014-05-17 11:07:42 -07:00
tqchen
95f4052aae add omp flag back 2014-05-17 11:07:12 -07:00
tqchen
e9e3e0281d use back g++ 2014-05-17 11:06:36 -07:00
tqchen
c23d8c8b88 force handle as void_p, seems fix mac problem 2014-05-17 11:03:21 -07:00
Tianqi Chen
e59f4d5a18 Merge pull request #5 from tqchen/dev
add return type for xgboost, don't know if it is mac problem. #4
2014-05-17 09:19:20 -07:00
tqchen
e267f4c5f9 add return type for xgboost, don't know if it is mac problem 2014-05-17 09:13:54 -07:00
Tianqi Chen
505e65ac08 Update README.md 2014-05-16 22:54:24 -07:00
Tianqi Chen
13fc48623e Merge pull request #2 from tqchen/dev
fix loss_type
2014-05-16 21:30:09 -07:00
tqchen
591a43ac0e some cleanup 2014-05-16 21:29:14 -07:00
tqchen
5375ac5c23 fix for loss_type problem in outside reset base 2014-05-16 21:28:03 -07:00
tqchen
6930758294 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-05-16 20:58:03 -07:00
tqchen
e09d6ab9de chg 2014-05-16 20:57:54 -07:00
antinucleon
db4a100f6b del 2014-05-17 03:57:38 +00:00
Tianqi Chen
495e37e0dc Merge pull request #1 from tqchen/dev
2.0 version, lots of changes
2014-05-16 20:53:19 -07:00
Tianqi Chen
b56b34944e Update README.md 2014-05-16 20:49:05 -07:00
tqchen
d4530b7a47 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:46:18 -07:00
tqchen
334cf5de9b add ignore 2014-05-16 20:46:08 -07:00
tqchen
004e8d811e final check 2014-05-16 20:44:02 -07:00
Tianqi Chen
4baefd857e Update README.md 2014-05-16 20:41:59 -07:00
Tianqi Chen
b52f01d61d Update README.md 2014-05-16 20:41:43 -07:00
Tianqi Chen
35f9ef684a Update README.md 2014-05-16 20:41:21 -07:00
Tianqi Chen
6f34096613 Update README.md 2014-05-16 20:41:05 -07:00
tqchen
31c5d7843f Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:37:55 -07:00
tqchen
f60dbe299e ok 2014-05-16 20:37:45 -07:00
yepyao
a77debc0c5 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-17 11:36:12 +08:00
yepyao
dc2b9c86e6 small change 2014-05-17 11:35:43 +08:00
yepyao
73bc8c0de4 small change 2014-05-17 11:34:24 +08:00
tqchen
ad8eb21fcd Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:29:17 -07:00
tqchen
416050d5c0 fix softmax 2014-05-16 20:28:07 -07:00
antinucleon
d5f6fba82d chg 2014-05-16 21:27:37 -06:00
tqchen
23f4c41035 chg 2014-05-16 20:18:34 -07:00
Tianqi Chen
7ea988a76b Update train.py 2014-05-16 20:16:10 -07:00
tqchen
d3c0ed14f3 multi class 2014-05-16 20:12:04 -07:00
antinucleon
2fcd875675 demo 2014-05-16 21:05:11 -06:00
antinucleon
615074efb6 Merge branch 'dev' of github.com:tqchen/xgboost into dev 2014-05-16 21:03:32 -06:00
Tianqi Chen
945b336fc6 Update README.md 2014-05-16 20:00:20 -07:00
antinucleon
8e8b8a8ee3 demo 2014-05-17 02:59:10 +00:00
antinucleon
42267807f5 demo 2014-05-16 20:57:42 -06:00
tqchen
df23464a20 do not need to dump in rank 2014-05-16 19:52:39 -07:00
tqchen
2ea8d9c511 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 19:51:41 -07:00
tqchen
3206235a5e before commit 2014-05-16 19:51:33 -07:00
yepyao
956fc09da0 small change 2014-05-17 10:50:15 +08:00
yepyao
da482500c7 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	demo/rank/mq2008.conf
	demo/rank/runexp.sh
	regrank/xgboost_regrank_obj.h
2014-05-17 10:40:12 +08:00
yepyao
b19f2bfda8 fix small bug 2014-05-17 10:35:10 +08:00
tqchen
21b21e69de add bing to author list 2014-05-16 19:33:59 -07:00
Tianqi Chen
b90d1dc92b Update demo.py 2014-05-16 19:30:32 -07:00
tqchen
3429ab3447 chgs 2014-05-16 19:24:53 -07:00
tqchen
ebcce4a2bf chg all settings to obj 2014-05-16 19:10:52 -07:00
tqchen
1839e6efe9 pre-release version 2014-05-16 18:49:02 -07:00
tqchen
9bc6e83afe chg scripts 2014-05-16 18:46:43 -07:00
tqchen
fd2774e133 cleanup 2014-05-16 18:40:46 -07:00
tqchen
72d3a6a3cc chg rank demo 2014-05-16 18:38:40 -07:00
tqchen
5febbecd88 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 18:29:37 -07:00
tqchen
b3c3ecd9c9 chng few things 2014-05-16 18:25:01 -07:00
tqchen
c28a1be34c minor changes 2014-05-16 18:19:57 -07:00
antinucleon
ae70b9b152 new speed test 2014-05-16 18:05:17 -06:00
antinucleon
e0a0343ae6 speedtest 2014-05-16 17:48:03 -06:00
yepyao
0e0d3efd6a use ndcg@all in lambdarank for ndcg 2014-05-16 23:06:24 +08:00
yepyao
a3bd5000ba small change 2014-05-16 21:20:41 +08:00
yepyao
dd71c0e070 Download data set from web site 2014-05-16 21:18:32 +08:00
kalenhaha
d9ea324057 Impement new Lambda rank interface 2014-05-16 20:42:46 +08:00
tqchen
0d29610c40 new lambda rank interface 2014-05-16 00:02:26 -07:00
Bing Xu
0af2c92d3b Update README.md 2014-05-16 01:30:29 -04:00
tqchen
f9cdce077b ok 2014-05-15 21:17:17 -07:00
tqchen
59183b9ed8 a correct version 2014-05-15 21:11:46 -07:00
tqchen
6ff272eec6 fix numpy convert 2014-05-15 20:28:34 -07:00
tqchen
c8073e13e4 ok 2014-05-15 20:05:22 -07:00
tqchen
698fa87bc3 ok 2014-05-15 18:56:28 -07:00
tqchen
8f56671901 bug fix in pairwise rank 2014-05-15 15:37:58 -07:00
tqchen
9ea9a7a01e cleanup code 2014-05-15 15:01:41 -07:00
tqchen
d59940f1d5 add xgcombine_buffer with weights 2014-05-15 14:41:11 -07:00
tqchen
6aa190e10c change data format to include weight in binary file, add get weight to python 2014-05-15 14:37:56 -07:00
tqchen
54c486bcf1 ok 2014-05-15 14:25:44 -07:00
tqchen
88ff293de5 add ams 2014-05-14 23:23:27 -07:00
tqchen
50af92e29e some fix 2014-05-14 16:55:59 -07:00
tqchen
bbe4957cd2 add AMS metric 2014-05-14 11:30:45 -07:00
kalenhaha
789ad18d36 add in grad and hess rescale in lambdarank 2014-05-14 23:13:27 +08:00
kalenhaha
2b34d5a25e small bug in ndcg eval 2014-05-13 14:30:42 +08:00
kalenhaha
bd574e4967 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-12 22:22:32 +08:00
kalenhaha
e8d81c1da5 Add LETOR MQ2008 for rank demo 2014-05-12 22:21:07 +08:00
kalenhaha
c84bbc91d1 remove sampler 2014-05-11 14:31:57 +08:00
kalenhaha
61e3d1562c small change 2014-05-11 14:25:30 +08:00
kalenhaha
97db8c29f2 small change 2014-05-11 14:03:21 +08:00
tqchen
f2552f8ef2 simple chgs 2014-05-09 20:39:15 -07:00
kalenhaha
2563b6d2d6 fix some warnings 2014-05-09 14:14:43 +08:00
kalenhaha
e90ffece67 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-09 14:07:06 +08:00
kalenhaha
85f92681f9 Separating Lambda MAP and Lambda NDCG 2014-05-09 14:05:52 +08:00
tqchen
5e0d52cb8c add python o3 2014-05-08 20:15:23 -07:00
tqchen
c9d156d99e faster convert to numpy array 2014-05-08 19:35:06 -07:00
tqchen
ecf6e8f49f commit the fix 2014-05-08 19:31:32 -07:00
tqchen
93778aa4aa Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-07 12:00:17 -07:00
tqchen
f8cacc7308 fix omp for bug in obj 2014-05-07 11:52:12 -07:00
kalenhaha
c0e1e9fe7a Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_obj.hpp
2014-05-07 22:15:59 +08:00
tqchen
fa5afe2141 fix 2014-05-06 16:53:37 -07:00
tqchen
f7789ecf14 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-06 16:51:18 -07:00
tqchen
a57fbe091a Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_data.h
2014-05-06 16:51:11 -07:00
tqchen
9f82b53366 add regrank utils 2014-05-06 16:50:46 -07:00
tqchen
248b2cf74d right group size 2014-05-06 16:49:10 -07:00
tqchen
5fb9376af4 add cutomized training 2014-05-04 13:57:10 -07:00
tqchen
9c2bb12cd1 add cutomized training 2014-05-04 13:55:58 -07:00
tqchen
ebde99bde8 add boost group support to xgboost. now have beta multi-class classification 2014-05-04 12:10:03 -07:00
kalenhaha
ef7be5398d c++11 features removed 2014-05-04 16:58:44 +08:00
kalenhaha
2ef61bf982 c++11 features removed 2014-05-04 16:56:57 +08:00
tqchen
d4d141347a fix 2014-05-04 00:09:16 -07:00
tqchen
e18ba04751 add interact mode 2014-05-03 23:24:22 -07:00
tqchen
3388d1a8b5 add python interface for xgboost 2014-05-03 23:04:02 -07:00
tqchen
65917bb831 finish python lib 2014-05-03 22:18:25 -07:00
tqchen
140499ac9e finish matrix 2014-05-03 17:12:25 -07:00
tqchen
ccd037292d good 2014-05-03 16:15:44 -07:00
tqchen
59939d0b14 ok 2014-05-03 14:24:00 -07:00
tqchen
9a2c00554d important change to regrank interface, need some more test 2014-05-03 14:20:27 -07:00
tqchen
ee30c1728b try python 2014-05-03 10:54:08 -07:00
tqchen
8f75b0ef75 pass test 2014-05-02 18:04:45 -07:00
tqchen
3128e718e2 add new combine tool as promised 2014-05-02 12:55:34 -07:00
tqchen
657c617215 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-01 11:01:05 -07:00
tqchen
439d4725a0 cleanup of evaluation metric, move c++11 codes into sample.h for backup, add lambda in a clean way latter 2014-05-01 11:00:50 -07:00
Tianqi Chen
8491bb3651 Update xgboost_omp.h 2014-05-01 10:16:05 -07:00
kalenhaha
cce96e8f41 fix some bugs in linux 2014-05-02 00:16:12 +08:00
kalenhaha
f02dd68713 lambda rank added 2014-05-01 22:17:26 +08:00
tqchen
ec14d32756 add softmax 2014-04-30 22:11:26 -07:00
tqchen
38577d45b0 add pre @ n 2014-04-30 22:00:53 -07:00
tqchen
ab0e7a3ddc use omp parallel sortting 2014-04-30 09:48:41 -07:00
tqchen
bbd952a021 add rank 2014-04-30 09:32:42 -07:00
tqchen
77e3051b1d add pairwise rank first version 2014-04-29 21:12:30 -07:00
tqchen
924e164c14 new AUC code 2014-04-29 17:26:58 -07:00
tqchen
25ff5ef169 new AUC evaluator, now compatible with weighted loss 2014-04-29 17:03:34 -07:00
tqchen
3ea29eccae make regression module compatible with rank loss, now support weighted loss 2014-04-29 16:16:02 -07:00
tqchen
0f8a3d21a5 chg fmap format 2014-04-29 09:59:10 -07:00
tqchen
7487c2f668 add auc evaluation metric 2014-04-24 22:20:40 -07:00
tqchen
88787b8573 remove unwanted private field 2014-04-21 10:42:19 -07:00
tqchen
17559a90f9 expose fmatrixs 2014-04-18 18:18:19 -07:00
tqchen
24696071a8 Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts:
	regression/xgboost_reg_data.h
2014-04-18 17:46:44 -07:00
tqchen
cca67af8d7 simplify data 2014-04-18 17:43:44 -07:00
kalenhaha
2beb92745f Lambda rank added 2014-04-11 10:50:13 +08:00
kalenhaha
d6b582dc70 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-04-11 10:48:45 +08:00
kalenhaha
218320daf2 Lambda rank added 2014-04-10 22:11:15 +08:00
kalenhaha
f83942d3e9 lambda rank added 2014-04-10 22:09:19 +08:00
Tianqi Chen
60d79eb2e7 Update xgboost_utils.h 2014-04-07 16:25:21 -07:00
kalenhaha
1136c71e64 rank pass toy 2014-04-07 23:25:35 +08:00
tqchen
1bbbb0cf7f add deleted main back 2014-04-06 09:32:27 -07:00
kalenhaha
1756fde0c6 small fix 2014-04-06 22:54:41 +08:00
kalenhaha
7f30fc1468 compiled 2014-04-06 22:51:52 +08:00
tqchen
d5607fbb55 add dev 2014-04-04 10:42:13 -07:00
kalenhaha
05d984d83d pairwise ranking implemented 2014-04-05 00:14:55 +08:00
kalenhaha
1110ae7421 Adding ranking task 2014-04-03 16:22:55 +08:00
58 changed files with 12394 additions and 1647 deletions

9
.gitignore vendored
View File

@@ -16,4 +16,11 @@
*conf *conf
*buffer *buffer
*model *model
xgboost xgboost
*pyc
*train
*test
*group
*rar
*vali
*data

View File

@@ -1,4 +1,4 @@
Copyright (c) 2014 Tianqi Chen Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

View File

@@ -1,6 +1,6 @@
export CC = gcc export CC = gcc
export CXX = g++ export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
# specify tensor path # specify tensor path
BIN = xgboost BIN = xgboost
@@ -10,7 +10,8 @@ OBJ =
all: $(BIN) $(OBJ) all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm export LDFLAGS= -pthread -lm
xgboost: regression/xgboost_reg_main.cpp regression/*.h booster/*.h booster/*/*.hpp booster/*.hpp xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h regrank/*.hpp booster/*.h booster/*/*.hpp booster/*.hpp
$(BIN) : $(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)

View File

@@ -1,20 +1,23 @@
xgboost: eXtreme Gradient Boosting xgboost: eXtreme Gradient Boosting
======= =======
A General purpose gradient boosting (tree) library. An optimized general purpose gradient boosting (tree) library.
Authors: Contributors: https://github.com/tqchen/xgboost/graphs/contributors
* Tianqi Chen, project creater
* Kailong Chen, contributes regression module
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.com/tqchen/xgboost/issues?q=is%3Aissue+label%3Aquestion)
Features Features
======= =======
* Sparse feature format: * Sparse feature format:
- Sparse feature format allows easy handling of missing values, and improve computation efficiency. - Sparse feature format allows easy handling of missing values, and improve computation efficiency.
* Push the limit on single machine: * Push the limit on single machine:
- Efficient implementation that optimizes memory and computation. - Efficient implementation that optimizes memory and computation.
* Layout of gradient boosting algorithm to support generic tasks, see project wiki. * Speed: XGBoost is very fast
- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
* Layout of gradient boosting algorithm to support user defined objective
* Python interface, works with numpy and scipy.sparse matrix
Supported key components Supported key components
======= =======
@@ -33,6 +36,12 @@ Planned components
- matrix factorization - matrix factorization
- structured prediction - structured prediction
Build
======
* Simply type make
* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
- You may get a error: -lgomp is not found, you can remove -fopenmp flag in Makefile to get single thread xgboost, or upgrade your compiler to compile multi-thread version
File extension convention File extension convention
======= =======
* .h are interface, utils and data structures, with detailed comment; * .h are interface, utils and data structures, with detailed comment;

View File

@@ -49,9 +49,8 @@ namespace xgboost{
}; };
private: private:
Entry best_entry; Entry best_entry;
const TreeParamTrain &param;
public: public:
RTSelecter( const TreeParamTrain &p ):param( p ){ RTSelecter( void ){
memset( &best_entry, 0, sizeof(best_entry) ); memset( &best_entry, 0, sizeof(best_entry) );
best_entry.loss_chg = 0.0f; best_entry.loss_chg = 0.0f;
} }
@@ -211,7 +210,7 @@ namespace xgboost{
const SCEntry *entry, size_t start, size_t end, const SCEntry *entry, size_t start, size_t end,
int findex, float parent_base_weight ){ int findex, float parent_base_weight ){
// local selecter // local selecter
RTSelecter slocal( param ); RTSelecter slocal;
if( param.need_forward_search() ){ if( param.need_forward_search() ){
// forward process, default right // forward process, default right
@@ -320,7 +319,7 @@ namespace xgboost{
// after this point, tmp_rptr and entry is ready to use // after this point, tmp_rptr and entry is ready to use
// global selecter // global selecter
RTSelecter sglobal( param ); RTSelecter sglobal;
// gain root // gain root
const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess ); const double root_gain = param.CalcRootGain( rsum_grad, rsum_hess );
// KEY: layerwise, weight of current node if it is leaf // KEY: layerwise, weight of current node if it is leaf

View File

@@ -2,7 +2,7 @@
#define XGBOOST_INL_HPP #define XGBOOST_INL_HPP
/*! /*!
* \file xgboost-inl.hpp * \file xgboost-inl.hpp
* \brief bootser implementations * \brief bootser implementations
* \author Tianqi Chen: tianqi.tchen@gmail.com * \author Tianqi Chen: tianqi.tchen@gmail.com
*/ */
// implementation of boosters go to here // implementation of boosters go to here
@@ -18,7 +18,7 @@
#include "linear/xgboost_linear.hpp" #include "linear/xgboost_linear.hpp"
namespace xgboost{ namespace xgboost{
namespace booster{ namespace booster{
/*! /*!
* \brief create a gradient booster, given type of booster * \brief create a gradient booster, given type of booster
* \param booster_type type of gradient booster, can be used to specify implements * \param booster_type type of gradient booster, can be used to specify implements
@@ -26,14 +26,14 @@ namespace xgboost{
* \return the pointer to the gradient booster created * \return the pointer to the gradient booster created
*/ */
template<typename FMatrix> template<typename FMatrix>
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ){ inline InterfaceBooster<FMatrix> *CreateBooster(int booster_type){
switch( booster_type ){ switch (booster_type){
case 0: return new RegTreeTrainer<FMatrix>(); case 0: return new RegTreeTrainer<FMatrix>();
case 1: return new LinearBooster<FMatrix>(); case 1: return new LinearBooster<FMatrix>();
default: utils::Error("unknown booster_type"); return NULL; default: utils::Error("unknown booster_type"); return NULL;
} }
} }
}; // namespace booster }; // namespace booster
}; // namespace xgboost }; // namespace xgboost
#endif // XGBOOST_INL_HPP #endif // XGBOOST_INL_HPP

View File

@@ -19,8 +19,8 @@
namespace xgboost{ namespace xgboost{
/*! \brief namespace for boosters */ /*! \brief namespace for boosters */
namespace booster{ namespace booster{
/*! /*!
* \brief interface of a gradient boosting learner * \brief interface of a gradient boosting learner
* \tparam FMatrix the feature matrix format that the booster takes * \tparam FMatrix the feature matrix format that the booster takes
*/ */
template<typename FMatrix> template<typename FMatrix>
@@ -35,101 +35,101 @@ namespace xgboost{
// call booster->LoadModel // call booster->LoadModel
// (3) booster->DoBoost to update the model // (3) booster->DoBoost to update the model
// (4) booster->Predict to get new prediction // (4) booster->Predict to get new prediction
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
* \param name name of the parameter * \param name name of the parameter
* \param val value of the parameter * \param val value of the parameter
*/ */
virtual void SetParam( const char *name, const char *val ) = 0; virtual void SetParam(const char *name, const char *val) = 0;
/*! /*!
* \brief load model from stream * \brief load model from stream
* \param fi input stream * \param fi input stream
*/ */
virtual void LoadModel( utils::IStream &fi ) = 0; virtual void LoadModel(utils::IStream &fi) = 0;
/*! /*!
* \brief save model to stream * \brief save model to stream
* \param fo output stream * \param fo output stream
*/ */
virtual void SaveModel( utils::IStream &fo ) const = 0; virtual void SaveModel(utils::IStream &fo) const = 0;
/*! /*!
* \brief initialize solver before training, called before training * \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation * this function is reserved for solver to allocate necessary space and do other preparation
*/ */
virtual void InitModel( void ) = 0; virtual void InitModel(void) = 0;
public: public:
/*! /*!
* \brief do gradient boost training for one step, using the information given, * \brief do gradient boost training for one step, using the information given,
* Note: content of grad and hess can change after DoBoost * Note: content of grad and hess can change after DoBoost
* \param grad first order gradient of each instance * \param grad first order gradient of each instance
* \param hess second order gradient of each instance * \param hess second order gradient of each instance
* \param feats features of each instance * \param feats features of each instance
* \param root_index pre-partitioned root index of each instance, * \param root_index pre-partitioned root index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved * root_index.size() can be 0 which indicates that no pre-partition involved
*/ */
virtual void DoBoost( std::vector<float> &grad, virtual void DoBoost(std::vector<float> &grad,
std::vector<float> &hess, std::vector<float> &hess,
const FMatrix &feats, const FMatrix &feats,
const std::vector<unsigned> &root_index ) = 0; const std::vector<unsigned> &root_index) = 0;
/*! /*!
* \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree * \brief predict the path ids along a trees, for given sparse feature vector. When booster is a tree
* \param path the result of path * \param path the result of path
* \param feats feature matrix * \param feats feature matrix
* \param row_index row index in the feature matrix * \param row_index row index in the feature matrix
* \param root_index root id of current instance, default = 0 * \param root_index root id of current instance, default = 0
*/ */
virtual void PredPath( std::vector<int> &path, const FMatrix &feats, virtual void PredPath(std::vector<int> &path, const FMatrix &feats,
bst_uint row_index, unsigned root_index = 0 ){ bst_uint row_index, unsigned root_index = 0){
utils::Error( "not implemented" ); utils::Error("not implemented");
} }
/*! /*!
* \brief predict values for given sparse feature vector * \brief predict values for given sparse feature vector
* *
* NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general, * NOTE: in tree implementation, Sparse Predict is OpenMP threadsafe, but not threadsafe in general,
* dense version of Predict to ensures threadsafety * dense version of Predict to ensures threadsafety
* \param feats feature matrix * \param feats feature matrix
* \param row_index row index in the feature matrix * \param row_index row index in the feature matrix
* \param root_index root id of current instance, default = 0 * \param root_index root id of current instance, default = 0
* \return prediction * \return prediction
*/ */
virtual float Predict( const FMatrix &feats, bst_uint row_index, unsigned root_index = 0 ){ virtual float Predict(const FMatrix &feats, bst_uint row_index, unsigned root_index = 0){
utils::Error( "not implemented" ); utils::Error("not implemented");
return 0.0f; return 0.0f;
} }
/*! /*!
* \brief predict values for given dense feature vector * \brief predict values for given dense feature vector
* \param feat feature vector in dense format * \param feat feature vector in dense format
* \param funknown indicator that the feature is missing * \param funknown indicator that the feature is missing
* \param rid root id of current instance, default = 0 * \param rid root id of current instance, default = 0
* \return prediction * \return prediction
*/ */
virtual float Predict( const std::vector<float> &feat, virtual float Predict(const std::vector<float> &feat,
const std::vector<bool> &funknown, const std::vector<bool> &funknown,
unsigned rid = 0 ){ unsigned rid = 0){
utils::Error( "not implemented" ); utils::Error("not implemented");
return 0.0f; return 0.0f;
} }
/*! /*!
* \brief print information * \brief print information
* \param fo output stream * \param fo output stream
*/ */
virtual void PrintInfo( FILE *fo ){} virtual void PrintInfo(FILE *fo){}
/*! /*!
* \brief dump model into text file * \brief dump model into text file
* \param fo output stream * \param fo output stream
* \param fmap feature map that may help give interpretations of feature * \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics * \param with_stats whether print statistics
*/ */
virtual void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats = false ){ virtual void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats = false){
utils::Error( "not implemented" ); utils::Error("not implemented");
} }
public: public:
/*! \brief virtual destructor */ /*! \brief virtual destructor */
virtual ~InterfaceBooster( void ){} virtual ~InterfaceBooster(void){}
}; };
}; };
namespace booster{ namespace booster{
/*! /*!
* \brief this will is the most commonly used booster interface * \brief this will is the most commonly used booster interface
* we try to make booster invariant of data structures, but most cases, FMatrixS is what we wnat * we try to make booster invariant of data structures, but most cases, FMatrixS is what we wnat
*/ */
typedef InterfaceBooster<FMatrixS> IBooster; typedef InterfaceBooster<FMatrixS> IBooster;
@@ -138,7 +138,7 @@ namespace xgboost{
namespace xgboost{ namespace xgboost{
namespace booster{ namespace booster{
/*! /*!
* \brief create a gradient booster, given type of booster * \brief create a gradient booster, given type of booster
* normally we use FMatrixS, by calling CreateBooster<FMatrixS> * normally we use FMatrixS, by calling CreateBooster<FMatrixS>
* \param booster_type type of gradient booster, can be used to specify implements * \param booster_type type of gradient booster, can be used to specify implements
@@ -146,7 +146,7 @@ namespace xgboost{
* \return the pointer to the gradient booster created * \return the pointer to the gradient booster created
*/ */
template<typename FMatrix> template<typename FMatrix>
inline InterfaceBooster<FMatrix> *CreateBooster( int booster_type ); inline InterfaceBooster<FMatrix> *CreateBooster(int booster_type);
}; };
}; };

View File

@@ -21,76 +21,76 @@ namespace xgboost{
typedef unsigned bst_uint; typedef unsigned bst_uint;
/*! \brief float type used in boost */ /*! \brief float type used in boost */
typedef float bst_float; typedef float bst_float;
/*! \brief debug option for booster */ /*! \brief debug option for booster */
const bool bst_debug = false; const bool bst_debug = false;
}; };
}; };
namespace xgboost{ namespace xgboost{
namespace booster{ namespace booster{
/** /**
* \brief This is a interface, defining the way to access features, * \brief This is a interface, defining the way to access features,
* by column or by row. This interface is used to make implementation * by column or by row. This interface is used to make implementation
* of booster does not depend on how feature is stored. * of booster does not depend on how feature is stored.
* *
* Why template instead of virtual class: for efficiency * Why template instead of virtual class: for efficiency
* feature matrix is going to be used by most inner loop of the algorithm * feature matrix is going to be used by most inner loop of the algorithm
* *
* \tparam Derived type of actual implementation * \tparam Derived type of actual implementation
* \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing * \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing
*/ */
template<typename Derived> template<typename Derived>
struct FMatrix{ struct FMatrix{
public: public:
/*! \brief exmaple iterator over one row */ /*! \brief exmaple iterator over one row */
struct RowIter{ struct RowIter{
/*! /*!
* \brief move to next position * \brief move to next position
* \return whether there is element in next position * \return whether there is element in next position
*/ */
inline bool Next( void ); inline bool Next(void);
/*! \return feature index in current position */ /*! \return feature index in current position */
inline bst_uint findex( void ) const; inline bst_uint findex(void) const;
/*! \return feature value in current position */ /*! \return feature value in current position */
inline bst_float fvalue( void ) const; inline bst_float fvalue(void) const;
}; };
/*! \brief example iterator over one column */ /*! \brief example iterator over one column */
struct ColIter{ struct ColIter{
/*! /*!
* \brief move to next position * \brief move to next position
* \return whether there is element in next position * \return whether there is element in next position
*/ */
inline bool Next( void ); inline bool Next(void);
/*! \return row index of current position */ /*! \return row index of current position */
inline bst_uint rindex( void ) const; inline bst_uint rindex(void) const;
/*! \return feature value in current position */ /*! \return feature value in current position */
inline bst_float fvalue( void ) const; inline bst_float fvalue(void) const;
}; };
/*! \brief backward iterator over column */ /*! \brief backward iterator over column */
struct ColBackIter : public ColIter {}; struct ColBackIter : public ColIter {};
public: public:
/*! /*!
* \brief get number of rows * \brief get number of rows
* \return number of rows * \return number of rows
*/ */
inline size_t NumRow( void ) const; inline size_t NumRow(void) const;
/*! /*!
* \brief get number of columns * \brief get number of columns
* \return number of columns * \return number of columns
*/ */
inline size_t NumCol( void ) const; inline size_t NumCol(void) const;
/*! /*!
* \brief get row iterator * \brief get row iterator
* \param ridx row index * \param ridx row index
* \return row iterator * \return row iterator
*/ */
inline RowIter GetRow( size_t ridx ) const; inline RowIter GetRow(size_t ridx) const;
/*! /*!
* \brief get number of column groups, this ise used together with GetRow( ridx, gid ) * \brief get number of column groups, this ise used together with GetRow( ridx, gid )
* \return number of column group * \return number of column group
*/ */
inline unsigned NumColGroup( void ) const{ inline unsigned NumColGroup(void) const{
return 1; return 1;
} }
/*! /*!
@@ -99,32 +99,32 @@ namespace xgboost{
* \param gid colmun group id * \param gid colmun group id
* \return row iterator, only iterates over features of specified column group * \return row iterator, only iterates over features of specified column group
*/ */
inline RowIter GetRow( size_t ridx, unsigned gid ) const; inline RowIter GetRow(size_t ridx, unsigned gid) const;
/*! \return whether column access is enabled */ /*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const; inline bool HaveColAccess(void) const;
/*! /*!
* \brief get column iterator, the columns must be sorted by feature value * \brief get column iterator, the columns must be sorted by feature value
* \param ridx column index * \param ridx column index
* \return column iterator * \return column iterator
*/ */
inline ColIter GetSortedCol( size_t ridx ) const; inline ColIter GetSortedCol(size_t ridx) const;
/*! /*!
* \brief get column backward iterator, starts from biggest fvalue, and iterator back * \brief get column backward iterator, starts from biggest fvalue, and iterator back
* \param ridx column index * \param ridx column index
* \return reverse column iterator * \return reverse column iterator
*/ */
inline ColBackIter GetReverseSortedCol( size_t ridx ) const; inline ColBackIter GetReverseSortedCol(size_t ridx) const;
}; };
}; };
}; };
namespace xgboost{ namespace xgboost{
namespace booster{ namespace booster{
/*! /*!
* \brief feature matrix to store training instance, in sparse CSR format * \brief feature matrix to store training instance, in sparse CSR format
*/ */
class FMatrixS: public FMatrix<FMatrixS>{ class FMatrixS : public FMatrix<FMatrixS>{
public: public:
/*! \brief one entry in a row */ /*! \brief one entry in a row */
struct REntry{ struct REntry{
@@ -133,10 +133,10 @@ namespace xgboost{
/*! \brief feature value */ /*! \brief feature value */
bst_float fvalue; bst_float fvalue;
/*! \brief constructor */ /*! \brief constructor */
REntry( void ){} REntry(void){}
/*! \brief constructor */ /*! \brief constructor */
REntry( bst_uint findex, bst_float fvalue ) : findex(findex), fvalue(fvalue){} REntry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue){}
inline static bool cmp_fvalue( const REntry &a, const REntry &b ){ inline static bool cmp_fvalue(const REntry &a, const REntry &b){
return a.fvalue < b.fvalue; return a.fvalue < b.fvalue;
} }
}; };
@@ -147,79 +147,79 @@ namespace xgboost{
/*! \brief size of the data */ /*! \brief size of the data */
bst_uint len; bst_uint len;
/*! \brief get k-th element */ /*! \brief get k-th element */
inline const REntry& operator[]( unsigned i ) const{ inline const REntry& operator[](unsigned i) const{
return data_[i]; return data_[i];
} }
}; };
/*! \brief row iterator */ /*! \brief row iterator */
struct RowIter{ struct RowIter{
const REntry *dptr_, *end_; const REntry *dptr_, *end_;
RowIter( const REntry* dptr, const REntry* end ) RowIter(const REntry* dptr, const REntry* end)
:dptr_(dptr),end_(end){} :dptr_(dptr), end_(end){}
inline bool Next( void ){ inline bool Next(void){
if( dptr_ == end_ ) return false; if (dptr_ == end_) return false;
else{ else{
++ dptr_; return true; ++dptr_; return true;
} }
} }
inline bst_uint findex( void ) const{ inline bst_uint findex(void) const{
return dptr_->findex; return dptr_->findex;
} }
inline bst_float fvalue( void ) const{ inline bst_float fvalue(void) const{
return dptr_->fvalue; return dptr_->fvalue;
} }
}; };
/*! \brief column iterator */ /*! \brief column iterator */
struct ColIter: public RowIter{ struct ColIter : public RowIter{
ColIter( const REntry* dptr, const REntry* end ) ColIter(const REntry* dptr, const REntry* end)
:RowIter( dptr, end ){} :RowIter(dptr, end){}
inline bst_uint rindex( void ) const{ inline bst_uint rindex(void) const{
return this->findex(); return this->findex();
} }
}; };
/*! \brief reverse column iterator */ /*! \brief reverse column iterator */
struct ColBackIter: public ColIter{ struct ColBackIter : public ColIter{
ColBackIter( const REntry* dptr, const REntry* end ) ColBackIter(const REntry* dptr, const REntry* end)
:ColIter( dptr, end ){} :ColIter(dptr, end){}
// shadows RowIter::Next // shadows RowIter::Next
inline bool Next( void ){ inline bool Next(void){
if( dptr_ == end_ ) return false; if (dptr_ == end_) return false;
else{ else{
-- dptr_; return true; --dptr_; return true;
} }
} }
}; };
public: public:
/*! \brief constructor */ /*! \brief constructor */
FMatrixS( void ){ this->Clear(); } FMatrixS(void){ this->Clear(); }
/*! \brief get number of rows */ /*! \brief get number of rows */
inline size_t NumRow( void ) const{ inline size_t NumRow(void) const{
return row_ptr_.size() - 1; return row_ptr_.size() - 1;
} }
/*! /*!
* \brief get number of nonzero entries * \brief get number of nonzero entries
* \return number of nonzero entries * \return number of nonzero entries
*/ */
inline size_t NumEntry( void ) const{ inline size_t NumEntry(void) const{
return row_data_.size(); return row_data_.size();
} }
/*! \brief clear the storage */ /*! \brief clear the storage */
inline void Clear( void ){ inline void Clear(void){
row_ptr_.clear(); row_ptr_.clear();
row_ptr_.push_back( 0 ); row_ptr_.push_back(0);
row_data_.clear(); row_data_.clear();
col_ptr_.clear(); col_ptr_.clear();
col_data_.clear(); col_data_.clear();
} }
/*! \brief get sparse part of current row */ /*! \brief get sparse part of current row */
inline Line operator[]( size_t sidx ) const{ inline Line operator[](size_t sidx) const{
Line sp; Line sp;
utils::Assert( !bst_debug || sidx < this->NumRow(), "row id exceed bound" ); utils::Assert(!bst_debug || sidx < this->NumRow(), "row id exceed bound");
sp.len = static_cast<bst_uint>( row_ptr_[ sidx + 1 ] - row_ptr_[ sidx ] ); sp.len = static_cast<bst_uint>(row_ptr_[sidx + 1] - row_ptr_[sidx]);
sp.data_ = &row_data_[ row_ptr_[ sidx ] ]; sp.data_ = &row_data_[row_ptr_[sidx]];
return sp; return sp;
} }
/*! /*!
* \brief add a row to the matrix, with data stored in STL container * \brief add a row to the matrix, with data stored in STL container
* \param findex feature index * \param findex feature index
* \param fvalue feature value * \param fvalue feature value
@@ -227,158 +227,161 @@ namespace xgboost{
* \param fend end bound range of feature * \param fend end bound range of feature
* \return the row id added line * \return the row id added line
*/ */
inline size_t AddRow( const std::vector<bst_uint> &findex, inline size_t AddRow(const std::vector<bst_uint> &findex,
const std::vector<bst_float> &fvalue, const std::vector<bst_float> &fvalue,
unsigned fstart = 0, unsigned fend = UINT_MAX ){ unsigned fstart = 0, unsigned fend = UINT_MAX){
utils::Assert( findex.size() == fvalue.size() ); utils::Assert(findex.size() == fvalue.size());
unsigned cnt = 0; unsigned cnt = 0;
for( size_t i = 0; i < findex.size(); i ++ ){ for (size_t i = 0; i < findex.size(); i++){
if( findex[i] < fstart || findex[i] >= fend ) continue; if (findex[i] < fstart || findex[i] >= fend) continue;
row_data_.push_back( REntry( findex[i], fvalue[i] ) ); row_data_.push_back(REntry(findex[i], fvalue[i]));
cnt ++; cnt++;
} }
row_ptr_.push_back( row_ptr_.back() + cnt ); row_ptr_.push_back(row_ptr_.back() + cnt);
return row_ptr_.size() - 2; return row_ptr_.size() - 2;
} }
/*! \brief get row iterator*/ /*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx ) const{ inline RowIter GetRow(size_t ridx) const{
utils::Assert( !bst_debug || ridx < this->NumRow(), "row id exceed bound" ); utils::Assert(!bst_debug || ridx < this->NumRow(), "row id exceed bound");
return RowIter( &row_data_[ row_ptr_[ridx] ] - 1, &row_data_[ row_ptr_[ridx+1] ] - 1 ); return RowIter(&row_data_[row_ptr_[ridx]] - 1, &row_data_[row_ptr_[ridx + 1]] - 1);
} }
/*! \brief get row iterator*/ /*! \brief get row iterator*/
inline RowIter GetRow( size_t ridx, unsigned gid ) const{ inline RowIter GetRow(size_t ridx, unsigned gid) const{
utils::Assert( gid == 0, "FMatrixS only have 1 column group" ); utils::Assert(gid == 0, "FMatrixS only have 1 column group");
return FMatrixS::GetRow( ridx ); return FMatrixS::GetRow(ridx);
} }
public: public:
/*! \return whether column access is enabled */ /*! \return whether column access is enabled */
inline bool HaveColAccess( void ) const{ inline bool HaveColAccess(void) const{
return col_ptr_.size() != 0 && col_data_.size() == row_data_.size(); return col_ptr_.size() != 0 && col_data_.size() == row_data_.size();
} }
/*! \brief get number of colmuns */ /*! \brief get number of colmuns */
inline size_t NumCol( void ) const{ inline size_t NumCol(void) const{
utils::Assert( this->HaveColAccess() ); utils::Assert(this->HaveColAccess());
return col_ptr_.size() - 1; return col_ptr_.size() - 1;
} }
/*! \brief get col iterator*/ /*! \brief get col iterator*/
inline ColIter GetSortedCol( size_t cidx ) const{ inline ColIter GetSortedCol(size_t cidx) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
return ColIter( &col_data_[ col_ptr_[cidx] ] - 1, &col_data_[ col_ptr_[cidx+1] ] - 1 ); return ColIter(&col_data_[col_ptr_[cidx]] - 1, &col_data_[col_ptr_[cidx + 1]] - 1);
} }
/*! \brief get col iterator */ /*! \brief get col iterator */
inline ColBackIter GetReverseSortedCol( size_t cidx ) const{ inline ColBackIter GetReverseSortedCol(size_t cidx) const{
utils::Assert( !bst_debug || cidx < this->NumCol(), "col id exceed bound" ); utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound");
return ColBackIter( &col_data_[ col_ptr_[cidx+1] ], &col_data_[ col_ptr_[cidx] ] ); return ColBackIter(&col_data_[col_ptr_[cidx + 1]], &col_data_[col_ptr_[cidx]]);
} }
/*! /*!
* \brief intialize the data so that we have both column and row major * \brief intialize the data so that we have both column and row major
* access, call this whenever we need column access * access, call this whenever we need column access
*/ */
inline void InitData( void ){ inline void InitData(void){
utils::SparseCSRMBuilder<REntry> builder( col_ptr_, col_data_ ); utils::SparseCSRMBuilder<REntry> builder(col_ptr_, col_data_);
builder.InitBudget( 0 ); builder.InitBudget(0);
for( size_t i = 0; i < this->NumRow(); i ++ ){ for (size_t i = 0; i < this->NumRow(); i++){
for( RowIter it = this->GetRow(i); it.Next(); ){ for (RowIter it = this->GetRow(i); it.Next();){
builder.AddBudget( it.findex() ); builder.AddBudget(it.findex());
} }
} }
builder.InitStorage(); builder.InitStorage();
for( size_t i = 0; i < this->NumRow(); i ++ ){ for (size_t i = 0; i < this->NumRow(); i++){
for( RowIter it = this->GetRow(i); it.Next(); ){ for (RowIter it = this->GetRow(i); it.Next();){
builder.PushElem( it.findex(), REntry( (bst_uint)i, it.fvalue() ) ); builder.PushElem(it.findex(), REntry((bst_uint)i, it.fvalue()));
} }
} }
// sort columns // sort columns
unsigned ncol = static_cast<unsigned>( this->NumCol() ); unsigned ncol = static_cast<unsigned>(this->NumCol());
for( unsigned i = 0; i < ncol; i ++ ){ #pragma omp parallel for schedule(static)
std::sort( &col_data_[ col_ptr_[ i ] ], &col_data_[ col_ptr_[ i+1 ] ], REntry::cmp_fvalue ); for (unsigned i = 0; i < ncol; i++){
std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue);
} }
} }
/*! /*!
* \brief save data to binary stream * \brief save data to binary stream
* note: since we have size_t in ptr, * note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machine * the function is not consistent between 64bit and 32bit machine
* \param fo output stream * \param fo output stream
*/ */
inline void SaveBinary( utils::IStream &fo ) const{ inline void SaveBinary(utils::IStream &fo) const{
FMatrixS::SaveBinary( fo, row_ptr_, row_data_ ); FMatrixS::SaveBinary(fo, row_ptr_, row_data_);
int col_access = this->HaveColAccess() ? 1 : 0; int col_access = this->HaveColAccess() ? 1 : 0;
fo.Write( &col_access, sizeof(int) ); fo.Write(&col_access, sizeof(int));
if( col_access != 0 ){ if (col_access != 0){
FMatrixS::SaveBinary( fo, col_ptr_, col_data_ ); FMatrixS::SaveBinary(fo, col_ptr_, col_data_);
} }
} }
/*! /*!
* \brief load data from binary stream * \brief load data from binary stream
* note: since we have size_t in ptr, * note: since we have size_t in ptr,
* the function is not consistent between 64bit and 32bit machin * the function is not consistent between 64bit and 32bit machin
* \param fi input stream * \param fi input stream
*/ */
inline void LoadBinary( utils::IStream &fi ){ inline void LoadBinary(utils::IStream &fi){
FMatrixS::LoadBinary( fi, row_ptr_, row_data_ ); FMatrixS::LoadBinary(fi, row_ptr_, row_data_);
int col_access; int col_access;
fi.Read( &col_access, sizeof(int) ); fi.Read(&col_access, sizeof(int));
if( col_access != 0 ){ if (col_access != 0){
FMatrixS::LoadBinary( fi, col_ptr_, col_data_ ); FMatrixS::LoadBinary(fi, col_ptr_, col_data_);
}else{
this->InitData();
} }
} }
/*! /*!
* \brief load from text file * \brief load from text file
* \param fi input file pointer * \param fi input file pointer
*/ */
inline void LoadText( FILE *fi ){ inline void LoadText(FILE *fi){
this->Clear(); this->Clear();
int ninst; int ninst;
while( fscanf( fi, "%d", &ninst ) == 1 ){ while (fscanf(fi, "%d", &ninst) == 1){
std::vector<booster::bst_uint> findex; std::vector<booster::bst_uint> findex;
std::vector<booster::bst_float> fvalue; std::vector<booster::bst_float> fvalue;
while( ninst -- ){ while (ninst--){
unsigned index; float value; unsigned index; float value;
utils::Assert( fscanf( fi, "%u:%f", &index, &value ) == 2, "load Text" ); utils::Assert(fscanf(fi, "%u:%f", &index, &value) == 2, "load Text");
findex.push_back( index ); fvalue.push_back( value ); findex.push_back(index); fvalue.push_back(value);
} }
this->AddRow( findex, fvalue ); this->AddRow(findex, fvalue);
} }
// initialize column support as well // initialize column support as well
this->InitData(); this->InitData();
} }
private: private:
/*! /*!
* \brief save data to binary stream * \brief save data to binary stream
* \param fo output stream * \param fo output stream
* \param ptr pointer data * \param ptr pointer data
* \param data data content * \param data data content
*/ */
inline static void SaveBinary( utils::IStream &fo, inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr, const std::vector<size_t> &ptr,
const std::vector<REntry> &data ){ const std::vector<REntry> &data){
size_t nrow = ptr.size() - 1; size_t nrow = ptr.size() - 1;
fo.Write( &nrow, sizeof(size_t) ); fo.Write(&nrow, sizeof(size_t));
fo.Write( &ptr[0], ptr.size() * sizeof(size_t) ); fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
if( data.size() != 0 ){ if (data.size() != 0){
fo.Write( &data[0] , data.size() * sizeof(REntry) ); fo.Write(&data[0], data.size() * sizeof(REntry));
} }
} }
/*! /*!
* \brief load data from binary stream * \brief load data from binary stream
* \param fi input stream * \param fi input stream
* \param ptr pointer data * \param ptr pointer data
* \param data data content * \param data data content
*/ */
inline static void LoadBinary( utils::IStream &fi, inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> &ptr, std::vector<size_t> &ptr,
std::vector<REntry> &data ){ std::vector<REntry> &data){
size_t nrow; size_t nrow;
utils::Assert( fi.Read( &nrow, sizeof(size_t) ) != 0, "Load FMatrixS" ); utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS");
ptr.resize( nrow + 1 ); ptr.resize(nrow + 1);
utils::Assert( fi.Read( &ptr[0], ptr.size() * sizeof(size_t) ), "Load FMatrixS" ); utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)) != 0, "Load FMatrixS");
data.resize( ptr.back() ); data.resize(ptr.back());
if( data.size() != 0 ){ if (data.size() != 0){
utils::Assert( fi.Read( &data[0] , data.size() * sizeof(REntry) ) , "Load FMatrixS" ); utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)) != 0, "Load FMatrixS");
} }
} }
protected: public:
/*! \brief row pointer of CSR sparse storage */ /*! \brief row pointer of CSR sparse storage */
std::vector<size_t> row_ptr_; std::vector<size_t> row_ptr_;
/*! \brief data in the row */ /*! \brief data in the row */
@@ -387,7 +390,7 @@ namespace xgboost{
std::vector<size_t> col_ptr_; std::vector<size_t> col_ptr_;
/*! \brief column datas */ /*! \brief column datas */
std::vector<REntry> col_data_; std::vector<REntry> col_data_;
}; };
}; };
}; };
#endif #endif

View File

@@ -8,25 +8,25 @@
#include "../utils/xgboost_config.h" #include "../utils/xgboost_config.h"
/*! /*!
* \file xgboost_gbmbase.h * \file xgboost_gbmbase.h
* \brief a base model class, * \brief a base model class,
* that assembles the ensembles of booster together and do model update * that assembles the ensembles of booster together and do model update
* this class can be used as base code to create booster variants * this class can be used as base code to create booster variants
* *
* The detailed implementation of boosters should start by using the class * The detailed implementation of boosters should start by using the class
* provided by this file * provided by this file
* *
* \author Tianqi Chen: tianqi.tchen@gmail.com * \author Tianqi Chen: tianqi.tchen@gmail.com
*/ */
namespace xgboost{ namespace xgboost{
namespace booster{ namespace booster{
/*! /*!
* \brief a base model class, * \brief a base model class,
* that assembles the ensembles of booster together and provide single routines to do prediction buffer and update * that assembles the ensembles of booster together and provide single routines to do prediction buffer and update
* this class can be used as base code to create booster variants * this class can be used as base code to create booster variants
* * * *
* relation to xgboost.h: * relation to xgboost.h:
* (1) xgboost.h provides a interface to a single booster(e.g. a single regression tree ) * (1) xgboost.h provides a interface to a single booster(e.g. a single regression tree )
* while GBMBaseModel builds upon IBooster to build a class that * while GBMBaseModel builds upon IBooster to build a class that
* ensembls the boosters together; * ensembls the boosters together;
* (2) GBMBaseModel provides prediction buffering scheme to speedup training; * (2) GBMBaseModel provides prediction buffering scheme to speedup training;
* (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles; * (3) Summary: GBMBaseModel is a standard wrapper for boosting ensembles;
@@ -37,259 +37,286 @@ namespace xgboost{
* (3) model.InitTrainer before calling model.Predict and model.DoBoost * (3) model.InitTrainer before calling model.Predict and model.DoBoost
* (4) model.Predict to get predictions given a instance * (4) model.Predict to get predictions given a instance
* (4) model.DoBoost to update the ensembles, add new booster to the model * (4) model.DoBoost to update the ensembles, add new booster to the model
* (4) model.SaveModel to save learned results * (4) model.SaveModel to save learned results
* *
* Bufferring: each instance comes with a buffer_index in Predict. * Bufferring: each instance comes with a buffer_index in Predict.
* when mparam.num_pbuffer != 0, a unique buffer index can be * when mparam.num_pbuffer != 0, a unique buffer index can be
* assigned to each instance to buffer previous results of boosters, * assigned to each instance to buffer previous results of boosters,
* this helps to speedup training, so consider assign buffer_index * this helps to speedup training, so consider assign buffer_index
* for each training instances, if buffer_index = -1, the code * for each training instances, if buffer_index = -1, the code
* recalculate things from scratch and will still works correctly * recalculate things from scratch and will still works correctly
*/ */
class GBMBase{ class GBMBase{
public: public:
/*! \brief number of thread used */ /*! \brief number of thread used */
GBMBase( void ){} GBMBase(void){}
/*! \brief destructor */ /*! \brief destructor */
virtual ~GBMBase( void ){ virtual ~GBMBase(void){
this->FreeSpace(); this->FreeSpace();
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
* \param name name of the parameter * \param name name of the parameter
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam( const char *name, const char *val ){ inline void SetParam(const char *name, const char *val){
if( !strncmp( name, "bst:", 4 ) ){ if (!strncmp(name, "bst:", 4)){
cfg.PushBack( name + 4, val ); cfg.PushBack(name + 4, val);
} }
if( !strcmp( name, "silent") ){ if (!strcmp(name, "silent")){
cfg.PushBack( name, val ); cfg.PushBack(name, val);
} }
tparam.SetParam( name, val ); tparam.SetParam(name, val);
if( boosters.size() == 0 ) mparam.SetParam( name, val ); if (boosters.size() == 0) mparam.SetParam(name, val);
} }
/*! /*!
* \brief load model from stream * \brief load model from stream
* \param fi input stream * \param fi input stream
*/ */
inline void LoadModel( utils::IStream &fi ){ inline void LoadModel(utils::IStream &fi){
if( boosters.size() != 0 ) this->FreeSpace(); if (boosters.size() != 0) this->FreeSpace();
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 ); utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
boosters.resize( mparam.num_boosters ); boosters.resize(mparam.num_boosters);
for( size_t i = 0; i < boosters.size(); i ++ ){ for (size_t i = 0; i < boosters.size(); i++){
boosters[ i ] = booster::CreateBooster<FMatrixS>( mparam.booster_type ); boosters[i] = booster::CreateBooster<FMatrixS>(mparam.booster_type);
boosters[ i ]->LoadModel( fi ); boosters[i]->LoadModel(fi);
} }
{// load info {// load info
booster_info.resize( mparam.num_boosters ); booster_info.resize(mparam.num_boosters);
if( mparam.num_boosters != 0 ){ if (mparam.num_boosters != 0){
utils::Assert( fi.Read( &booster_info[0], sizeof(int)*mparam.num_boosters ) != 0 ); utils::Assert(fi.Read(&booster_info[0], sizeof(int)*mparam.num_boosters) != 0);
} }
} }
if( mparam.num_pbuffer != 0 ){ if (mparam.num_pbuffer != 0){
pred_buffer.resize ( mparam.num_pbuffer ); pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize( mparam.num_pbuffer ); pred_counter.resize(mparam.PredBufferSize());
utils::Assert( fi.Read( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ) != 0 ); utils::Assert(fi.Read(&pred_buffer[0], pred_buffer.size()*sizeof(float)) != 0);
utils::Assert( fi.Read( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ) != 0 ); utils::Assert(fi.Read(&pred_counter[0], pred_counter.size()*sizeof(unsigned)) != 0);
} }
} }
/*! /*!
* \brief save model to stream * \brief save model to stream
* \param fo output stream * \param fo output stream
*/ */
inline void SaveModel( utils::IStream &fo ) const { inline void SaveModel(utils::IStream &fo) const {
utils::Assert( mparam.num_boosters == (int)boosters.size() ); utils::Assert(mparam.num_boosters == (int)boosters.size());
fo.Write( &mparam, sizeof(ModelParam) ); fo.Write(&mparam, sizeof(ModelParam));
for( size_t i = 0; i < boosters.size(); i ++ ){ for (size_t i = 0; i < boosters.size(); i++){
boosters[ i ]->SaveModel( fo ); boosters[i]->SaveModel(fo);
} }
if( booster_info.size() != 0 ){ if (booster_info.size() != 0){
fo.Write( &booster_info[0], sizeof(int) * booster_info.size() ); fo.Write(&booster_info[0], sizeof(int)* booster_info.size());
} }
if( mparam.num_pbuffer != 0 ){ if (mparam.num_pbuffer != 0){
fo.Write( &pred_buffer[0] , pred_buffer.size()*sizeof(float) ); fo.Write(&pred_buffer[0], pred_buffer.size()*sizeof(float));
fo.Write( &pred_counter[0], pred_counter.size()*sizeof(unsigned) ); fo.Write(&pred_counter[0], pred_counter.size()*sizeof(unsigned));
} }
} }
/*! /*!
* \brief initialize the current data storage for model, if the model is used first time, call this function * \brief initialize the current data storage for model, if the model is used first time, call this function
*/ */
inline void InitModel( void ){ inline void InitModel(void){
pred_buffer.clear(); pred_counter.clear(); pred_buffer.clear(); pred_counter.clear();
pred_buffer.resize ( mparam.num_pbuffer, 0.0 ); pred_buffer.resize(mparam.PredBufferSize(), 0.0);
pred_counter.resize( mparam.num_pbuffer, 0 ); pred_counter.resize(mparam.PredBufferSize(), 0);
utils::Assert( mparam.num_boosters == 0 ); utils::Assert(mparam.num_boosters == 0);
utils::Assert( boosters.size() == 0 ); utils::Assert(boosters.size() == 0);
} }
/*! /*!
* \brief initialize solver before training, called before training * \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation * this function is reserved for solver to allocate necessary space and do other preparation
*/ */
inline void InitTrainer( void ){ inline void InitTrainer(void){
if( tparam.nthread != 0 ){ if (tparam.nthread != 0){
omp_set_num_threads( tparam.nthread ); omp_set_num_threads(tparam.nthread);
} }
if (mparam.num_booster_group == 0) mparam.num_booster_group = 1;
// make sure all the boosters get the latest parameters // make sure all the boosters get the latest parameters
for( size_t i = 0; i < this->boosters.size(); i ++ ){ for (size_t i = 0; i < this->boosters.size(); i++){
this->ConfigBooster( this->boosters[i] ); this->ConfigBooster(this->boosters[i]);
} }
} }
/*! /*!
* \brief DumpModel * \brief DumpModel
* \param fo text file * \param fo text file
* \param fmap feature map that may help give interpretations of feature * \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics * \param with_stats whether print statistics
*/ */
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){ inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){
for( size_t i = 0; i < boosters.size(); i ++ ){ for (size_t i = 0; i < boosters.size(); i++){
fprintf( fo, "booster[%d]\n", (int)i ); fprintf(fo, "booster[%d]\n", (int)i);
boosters[i]->DumpModel( fo, fmap, with_stats ); boosters[i]->DumpModel(fo, fmap, with_stats);
} }
} }
/*! /*!
* \brief Dump path of all trees * \brief Dump path of all trees
* \param fo text file * \param fo text file
* \param data input data * \param data input data
*/ */
inline void DumpPath( FILE *fo, const FMatrixS &data ){ inline void DumpPath(FILE *fo, const FMatrixS &data){
for( size_t i = 0; i < data.NumRow(); ++ i ){ for (size_t i = 0; i < data.NumRow(); ++i){
for( size_t j = 0; j < boosters.size(); ++ j ){ for (size_t j = 0; j < boosters.size(); ++j){
if( j != 0 ) fprintf( fo, "\t" ); if (j != 0) fprintf(fo, "\t");
std::vector<int> path; std::vector<int> path;
boosters[j]->PredPath( path, data, i ); boosters[j]->PredPath(path, data, i);
fprintf( fo, "%d", path[0] ); fprintf(fo, "%d", path[0]);
for( size_t k = 1; k < path.size(); ++ k ){ for (size_t k = 1; k < path.size(); ++k){
fprintf( fo, ",%d", path[k] ); fprintf(fo, ",%d", path[k]);
} }
} }
fprintf( fo, "\n" ); fprintf(fo, "\n");
} }
} }
public: public:
/*! /*!
* \brief do gradient boost training for one step, using the information given * \brief do gradient boost training for one step, using the information given
* Note: content of grad and hess can change after DoBoost * Note: content of grad and hess can change after DoBoost
* \param grad first order gradient of each instance * \param grad first order gradient of each instance
* \param hess second order gradient of each instance * \param hess second order gradient of each instance
* \param feats features of each instance * \param feats features of each instance
* \param root_index pre-partitioned root index of each instance, * \param root_index pre-partitioned root index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved * root_index.size() can be 0 which indicates that no pre-partition involved
* \param bst_group which booster group it belongs to, by default, we only have 1 booster group, and leave this parameter as default
*/ */
inline void DoBoost( std::vector<float> &grad, inline void DoBoost(std::vector<float> &grad,
std::vector<float> &hess, std::vector<float> &hess,
const booster::FMatrixS &feats, const booster::FMatrixS &feats,
const std::vector<unsigned> &root_index ) { const std::vector<unsigned> &root_index,
booster::IBooster *bst = this->GetUpdateBooster(); int bst_group = 0 ) {
bst->DoBoost( grad, hess, feats, root_index ); booster::IBooster *bst = this->GetUpdateBooster( bst_group );
bst->DoBoost(grad, hess, feats, root_index);
} }
/*! /*!
* \brief predict values for given sparse feature vector * \brief predict values for given sparse feature vector
* NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe * NOTE: in tree implementation, this is only OpenMP threadsafe, but not threadsafe
* \param feats feature matrix * \param feats feature matrix
* \param row_index row index in the feature matrix * \param row_index row index in the feature matrix
* \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned * \param buffer_index the buffer index of the current feature line, default -1 means no buffer assigned
* \param root_index root id of current instance, default = 0 * \param root_index root id of current instance, default = 0
* \return prediction * \param bst_group booster group index
* \return prediction
*/ */
inline float Predict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ inline float Predict(const FMatrixS &feats, bst_uint row_index,
size_t istart = 0; int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
size_t itop = 0;
float psum = 0.0f; float psum = 0.0f;
const int bid = mparam.BufferOffset(buffer_index, bst_group);
// load buffered results if any // load buffered results if any
if( mparam.do_reboost == 0 && buffer_index >= 0 ){ if (mparam.do_reboost == 0 && bid >= 0){
utils::Assert( buffer_index < mparam.num_pbuffer, "buffer index exceed num_pbuffer" ); itop = this->pred_counter[bid];
istart = this->pred_counter[ buffer_index ]; psum = this->pred_buffer[bid];
psum = this->pred_buffer [ buffer_index ]; }
for (size_t i = itop; i < this->boosters.size(); ++i ){
if( booster_info[i] == bst_group ){
psum += this->boosters[i]->Predict(feats, row_index, root_index);
}
} }
for( size_t i = istart; i < this->boosters.size(); i ++ ){
psum += this->boosters[ i ]->Predict( feats, row_index, root_index );
}
// updated the buffered results // updated the buffered results
if( mparam.do_reboost == 0 && buffer_index >= 0 ){ if (mparam.do_reboost == 0 && bid >= 0){
this->pred_counter[ buffer_index ] = static_cast<unsigned>( boosters.size() ); this->pred_counter[bid] = static_cast<unsigned>(boosters.size());
this->pred_buffer [ buffer_index ] = psum; this->pred_buffer[bid] = psum;
} }
return psum; return psum;
} }
/*! \return number of boosters so far */
inline int NumBoosters(void) const{
return mparam.num_boosters;
}
/*! \return number of booster groups */
inline int NumBoosterGroup(void) const{
if( mparam.num_booster_group == 0 ) return 1;
return mparam.num_booster_group;
}
public: public:
//--------trial code for interactive update an existing booster------ //--------trial code for interactive update an existing booster------
//-------- usually not needed, ignore this region --------- //-------- usually not needed, ignore this region ---------
/*! /*!
* \brief same as Predict, but removes the prediction of booster to be updated * \brief same as Predict, but removes the prediction of booster to be updated
* this function must be called once and only once for every data with pbuffer * this function must be called once and only once for every data with pbuffer
*/ */
inline float InteractPredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ inline float InteractPredict(const FMatrixS &feats, bst_uint row_index,
float psum = this->Predict( feats, row_index, buffer_index, root_index ); int buffer_index = -1, unsigned root_index = 0, int bst_group = 0){
if( tparam.reupdate_booster != -1 ){ float psum = this->Predict(feats, row_index, buffer_index, root_index);
if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster; const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
psum -= boosters[ bid ]->Predict( feats, row_index, root_index ); if( bst_group == booster_info[bid] ){
if( mparam.do_reboost == 0 && buffer_index >= 0 ){ psum -= boosters[bid]->Predict(feats, row_index, root_index);
this->pred_buffer[ buffer_index ] = psum; }
if (mparam.do_reboost == 0 && buffer_index >= 0){
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] = psum;
} }
} }
return psum; return psum;
} }
/*! \brief delete the specified booster */ /*! \brief delete the specified booster */
inline void DelteBooster( void ){ inline void DelteBooster(void){
const int bid = tparam.reupdate_booster; const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < mparam.num_boosters , "must specify booster index for deletion"); utils::Assert(bid >= 0 && bid < mparam.num_boosters, "must specify booster index for deletion");
delete boosters[ bid ]; delete boosters[bid];
for( int i = bid + 1; i < mparam.num_boosters; ++ i ){ for (int i = bid + 1; i < mparam.num_boosters; ++i){
boosters[i-1] = boosters[ i ]; boosters[i - 1] = boosters[i];
booster_info[i-1] = booster_info[ i ]; booster_info[i - 1] = booster_info[i];
} }
boosters.resize( mparam.num_boosters -= 1 ); boosters.resize(mparam.num_boosters -= 1);
booster_info.resize( boosters.size() ); booster_info.resize(boosters.size());
// update pred counter
for( size_t i = 0; i < pred_counter.size(); ++ i ){
if( pred_counter[i] > (unsigned)bid ) pred_counter[i] -= 1;
}
} }
/*! \brief update the prediction buffer, after booster have been updated */ /*! \brief update the prediction buffer, after booster have been updated */
inline void InteractRePredict( const FMatrixS &feats, bst_uint row_index, int buffer_index = -1, unsigned root_index = 0 ){ inline void InteractRePredict(const FMatrixS &feats, bst_uint row_index,
if( tparam.reupdate_booster != -1 ){ int buffer_index = -1, unsigned root_index = 0, int bst_group = 0 ){
if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster; const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); if( booster_info[bid] != bst_group ) return;
if( mparam.do_reboost == 0 && buffer_index >= 0 ){ utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
this->pred_buffer[ buffer_index ] += boosters[ bid ]->Predict( feats, row_index, root_index ); if (mparam.do_reboost == 0 && buffer_index >= 0){
this->pred_buffer[mparam.BufferOffset(buffer_index,bst_group)] += boosters[bid]->Predict(feats, row_index, root_index);
} }
} }
} }
//-----------non public fields afterwards------------- //-----------non public fields afterwards-------------
protected: protected:
/*! \brief free space of the model */ /*! \brief free space of the model */
inline void FreeSpace( void ){ inline void FreeSpace(void){
for( size_t i = 0; i < boosters.size(); i ++ ){ for (size_t i = 0; i < boosters.size(); i++){
delete boosters[i]; delete boosters[i];
} }
boosters.clear(); booster_info.clear(); mparam.num_boosters = 0; boosters.clear(); booster_info.clear(); mparam.num_boosters = 0;
} }
/*! \brief configure a booster */ /*! \brief configure a booster */
inline void ConfigBooster( booster::IBooster *bst ){ inline void ConfigBooster(booster::IBooster *bst){
cfg.BeforeFirst(); cfg.BeforeFirst();
while( cfg.Next() ){ while (cfg.Next()){
bst->SetParam( cfg.name(), cfg.val() ); bst->SetParam(cfg.name(), cfg.val());
} }
} }
/*! /*!
* \brief get a booster to update * \brief get a booster to update
* \return the booster created * \return the booster created
*/ */
inline booster::IBooster *GetUpdateBooster( void ){ inline booster::IBooster *GetUpdateBooster(int bst_group){
if( tparam.reupdate_booster != -1 ){ if (tparam.reupdate_booster != -1){
const int bid = tparam.reupdate_booster; const int bid = tparam.reupdate_booster;
utils::Assert( bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound" ); utils::Assert(bid >= 0 && bid < (int)boosters.size(), "interact:booster_index exceed existing bound");
this->ConfigBooster( boosters[bid] ); this->ConfigBooster(boosters[bid]);
return boosters[ bid ]; utils::Assert( bst_group == booster_info[bid], "booster group must match existing reupdate booster");
return boosters[bid];
} }
if( mparam.do_reboost == 0 || boosters.size() == 0 ){ if (mparam.do_reboost == 0 || boosters.size() == 0){
mparam.num_boosters += 1; mparam.num_boosters += 1;
boosters.push_back( booster::CreateBooster<FMatrixS>( mparam.booster_type ) ); boosters.push_back(booster::CreateBooster<FMatrixS>(mparam.booster_type));
booster_info.push_back( 0 ); booster_info.push_back(bst_group);
this->ConfigBooster( boosters.back() ); this->ConfigBooster(boosters.back());
boosters.back()->InitModel(); boosters.back()->InitModel();
}else{ }
this->ConfigBooster( boosters.back() ); else{
this->ConfigBooster(boosters.back());
} }
return boosters.back(); return boosters.back();
} }
@@ -306,76 +333,93 @@ namespace xgboost{
int num_feature; int num_feature;
/*! \brief size of predicton buffer allocated for buffering boosting computation */ /*! \brief size of predicton buffer allocated for buffering boosting computation */
int num_pbuffer; int num_pbuffer;
/*! /*!
* \brief whether we repeatly update a single booster each round: default 0 * \brief whether we repeatly update a single booster each round: default 0
* set to 1 for linear booster, so that regularization term can be considered * set to 1 for linear booster, so that regularization term can be considered
*/ */
int do_reboost; int do_reboost;
/*!
* \brief number of booster group, how many predictions a single
* input instance could corresponds to
*/
int num_booster_group;
/*! \brief reserved parameters */ /*! \brief reserved parameters */
int reserved[ 32 ]; int reserved[31];
/*! \brief constructor */ /*! \brief constructor */
ModelParam( void ){ ModelParam(void){
num_boosters = 0; num_boosters = 0;
booster_type = 0; booster_type = 0;
num_roots = num_feature = 0; num_roots = num_feature = 0;
do_reboost = 0; do_reboost = 0;
num_pbuffer = 0; num_pbuffer = 0;
memset( reserved, 0, sizeof( reserved ) ); num_booster_group = 1;
memset(reserved, 0, sizeof(reserved));
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
* \param name name of the parameter * \param name name of the parameter
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam( const char *name, const char *val ){ inline void SetParam(const char *name, const char *val){
if( !strcmp("booster_type", name ) ){ if (!strcmp("booster_type", name)){
booster_type = atoi( val ); booster_type = atoi(val);
// linear boost automatically set do reboost // linear boost automatically set do reboost
if( booster_type == 1 ) do_reboost = 1; if (booster_type == 1) do_reboost = 1;
} }
if( !strcmp("num_pbuffer", name ) ) num_pbuffer = atoi( val ); if (!strcmp("num_pbuffer", name)) num_pbuffer = atoi(val);
if( !strcmp("do_reboost", name ) ) do_reboost = atoi( val ); if (!strcmp("do_reboost", name)) do_reboost = atoi(val);
if( !strcmp("bst:num_roots", name ) ) num_roots = atoi( val ); if (!strcmp("num_booster_group", name)) num_booster_group = atoi(val);
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val ); if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
inline int PredBufferSize(void) const{
if (num_booster_group == 0) return num_pbuffer;
else return num_booster_group * num_pbuffer;
}
inline int BufferOffset( int buffer_index, int bst_group ) const{
if( buffer_index < 0 ) return -1;
utils::Assert( buffer_index < num_pbuffer, "buffer_indexexceed num_pbuffer" );
return buffer_index + num_pbuffer * bst_group;
} }
}; };
/*! \brief training parameters */ /*! \brief training parameters */
struct TrainParam{ struct TrainParam{
/*! \brief number of OpenMP threads */ /*! \brief number of OpenMP threads */
int nthread; int nthread;
/*! /*!
* \brief index of specific booster to be re-updated, default = -1: update new booster * \brief index of specific booster to be re-updated, default = -1: update new booster
* parameter this is part of trial interactive update mode * parameter this is part of trial interactive update mode
*/ */
int reupdate_booster; int reupdate_booster;
/*! \brief constructor */ /*! \brief constructor */
TrainParam( void ) { TrainParam(void) {
nthread = 1; nthread = 1;
reupdate_booster = -1; reupdate_booster = -1;
} }
/*! /*!
* \brief set parameters from outside * \brief set parameters from outside
* \param name name of the parameter * \param name name of the parameter
* \param val value of the parameter * \param val value of the parameter
*/ */
inline void SetParam( const char *name, const char *val ){ inline void SetParam(const char *name, const char *val){
if( !strcmp("nthread", name ) ) nthread = atoi( val ); if (!strcmp("nthread", name)) nthread = atoi(val);
if( !strcmp("interact:booster_index", name ) ) reupdate_booster = atoi( val ); if (!strcmp("interact:booster_index", name)) reupdate_booster = atoi(val);
} }
}; };
protected: protected:
/*! \brief model parameters */ /*! \brief model parameters */
ModelParam mparam; ModelParam mparam;
/*! \brief training parameters */ /*! \brief training parameters */
TrainParam tparam; TrainParam tparam;
protected: protected:
/*! \brief component boosters */ /*! \brief component boosters */
std::vector<booster::IBooster*> boosters; std::vector<booster::IBooster*> boosters;
/*! \brief some information indicator of the booster, reserved */ /*! \brief some information indicator of the booster, reserved */
std::vector<int> booster_info; std::vector<int> booster_info;
/*! \brief prediction buffer */ /*! \brief prediction buffer */
std::vector<float> pred_buffer; std::vector<float> pred_buffer;
/*! \brief prediction buffer counter, record the progress so fart of the buffer */ /*! \brief prediction buffer counter, record the progress so fart of the buffer */
std::vector<unsigned> pred_counter; std::vector<unsigned> pred_counter;
/*! \brief configurations saved for each booster */ /*! \brief configurations saved for each booster */
utils::ConfigSaver cfg; utils::ConfigSaver cfg;

View File

@@ -24,7 +24,7 @@ def loadfmap( fname ):
return fmap, nmap return fmap, nmap
def write_nmap( fo, nmap ): def write_nmap( fo, nmap ):
for i in xrange( len(nmap) ): for i in range( len(nmap) ):
fo.write('%d\t%s\ti\n' % (i, nmap[i]) ) fo.write('%d\t%s\ti\n' % (i, nmap[i]) )
# start here # start here
@@ -41,7 +41,7 @@ for l in open( 'agaricus-lepiota.data' ):
else: else:
assert arr[0] == 'e' assert arr[0] == 'e'
fo.write('0') fo.write('0')
for i in xrange( 1,len(arr) ): for i in range( 1,len(arr) ):
fo.write( ' %d:1' % fmap[i][arr[i].strip()] ) fo.write( ' %d:1' % fmap[i][arr[i].strip()] )
fo.write('\n') fo.write('\n')

View File

@@ -3,7 +3,7 @@ import sys
import random import random
if len(sys.argv) < 2: if len(sys.argv) < 2:
print 'Usage:<filename> <k> [nfold = 5]' print ('Usage:<filename> <k> [nfold = 5]')
exit(0) exit(0)
random.seed( 10 ) random.seed( 10 )

View File

@@ -2,7 +2,7 @@
# choose the tree booster, 0: tree, 1: linear # choose the tree booster, 0: tree, 1: linear
booster_type = 0 booster_type = 0
# choose logistic regression loss function for binary classification # choose logistic regression loss function for binary classification
loss_type = 2 objective = binary:logistic
# Tree Booster Parameters # Tree Booster Parameters
# step size shrinkage # step size shrinkage
@@ -23,5 +23,7 @@ save_period = 0
data = "agaricus.txt.train" data = "agaricus.txt.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set # The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "agaricus.txt.test" eval[test] = "agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data # The path of test data
test:data = "agaricus.txt.test" test:data = "agaricus.txt.test"

View File

@@ -0,0 +1,19 @@
Guide for Kaggle Higgs Challenge
=====
This is the folder giving example of how to use XGBoost Python Module to run Kaggle Higgs competition
This script will achieve about 3.600 AMS score in public leadboard. To get start, you need do following step:
1. Compile the XGBoost python lib
```bash
cd ../../python
make
```
2. Put training.csv test.csv on folder './data' (you can create a symbolic link)
3. Run ./run.sh
Speed
=====
speedtest.py compares xgboost's speed on this dataset with sklearn.GBM

View File

@@ -0,0 +1,62 @@
#!/usr/bin/python
# this is the example script to use xgboost to train
import inspect
import os
import sys
import numpy as np
# add path of xgboost python module
code_path = os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../python")
sys.path.append(code_path)
import xgboost as xgb
test_size = 550000
# path to where the data lies
dpath = 'data'
# load in training data, directly use numpy
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } )
print ('finish loading from csv ')
label = dtrain[:,32]
data = dtrain[:,1:31]
# rescale weight to make it same as test set
weight = dtrain[:,31] * float(test_size) / len(label)
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
# setup parameters for xgboost
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 16
# you can directly throw param in, though we want to watch multiple metrics here
plst = list(param.items())+[('eval_metric', 'ams@0.15')]
watchlist = [ (xgmat,'train') ]
# boost 120 tres
num_round = 120
print ('loading data end, start to boost trees')
bst = xgb.train( plst, xgmat, num_round, watchlist );
# save out model
bst.save_model('higgs.model')
print ('finish training')

54
demo/kaggle-higgs/higgs-pred.py Executable file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/python
# make prediction
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb
# path to where the data lies
dpath = 'data'
modelfile = 'higgs.model'
outfile = 'higgs.pred.csv'
# make top 15% as positive
threshold_ratio = 0.15
# load in training data, directly use numpy
dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
data = dtest[:,1:31]
idx = dtest[:,0]
print ('finish loading from csv ')
xgmat = xgb.DMatrix( data, missing = -999.0 )
bst = xgb.Booster({'nthread':16})
bst.load_model( modelfile )
ypred = bst.predict( xgmat )
res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
rorder = {}
for k, v in sorted( res, key = lambda x:-x[1] ):
rorder[ k ] = len(rorder) + 1
# write out predictions
ntop = int( threshold_ratio * len(rorder ) )
fo = open(outfile, 'w')
nhit = 0
ntot = 0
fo.write('EventId,RankOrder,Class\n')
for k, v in res:
if rorder[k] <= ntop:
lb = 's'
nhit += 1
else:
lb = 'b'
# change output rank order to follow Kaggle convention
fo.write('%s,%d,%s\n' % ( k, len(rorder)+1-rorder[k], lb ) )
ntot += 1
fo.close()
print ('finished writing into prediction file')

14
demo/kaggle-higgs/run.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
python -u higgs-numpy.py
ret=$?
if [[ $ret != 0 ]]; then
echo "ERROR in higgs-numpy.py"
exit $ret
fi
python -u higgs-pred.py
ret=$?
if [[ $ret != 0 ]]; then
echo "ERROR in higgs-pred.py"
exit $ret
fi

66
demo/kaggle-higgs/speedtest.py Executable file
View File

@@ -0,0 +1,66 @@
#!/usr/bin/python
# this is the example script to use xgboost to train
import sys
import numpy as np
# add path of xgboost python module
sys.path.append('../../python/')
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
import time
test_size = 550000
# path to where the data lies
dpath = 'data'
# load in training data, directly use numpy
dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
print ('finish loading from csv ')
label = dtrain[:,32]
data = dtrain[:,1:31]
# rescale weight to make it same as test set
weight = dtrain[:,31] * float(test_size) / len(label)
sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
# construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
# setup parameters for xgboost
param = {}
# use logistic regression loss
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4
plst = param.items()+[('eval_metric', 'ams@0.15')]
watchlist = [ (xgmat,'train') ]
# boost 10 tres
num_round = 10
print ('loading data end, start to boost trees')
print ("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
gbm.fit(data, label)
print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()
print ("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
param['nthread'] = i
tmp = time.time()
plst = param.items()+[('eval_metric', 'ams@0.15')]
bst = xgb.train( plst, xgmat, num_round, watchlist );
print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
print ('finish training')

View File

@@ -0,0 +1,10 @@
Demonstrating how to use XGBoost accomplish Multi-Class classification task on [UCI Dermatology dataset](https://archive.ics.uci.edu/ml/datasets/Dermatology)
Make sure you make make xgboost python module in ../../python
1. Run runexp.sh
```bash
./runexp.sh
```
Explainations can be found in [wiki](https://github.com/tqchen/xgboost/wiki)

View File

@@ -0,0 +1,9 @@
#!/bin/bash
if [ -f dermatology.data ]
then
echo "use existing data to run multi class classification"
else
echo "getting data from uci, make sure you are connected to internet"
wget https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data
fi
python train.py

View File

@@ -0,0 +1,49 @@
#! /usr/bin/python
import sys
import numpy as np
sys.path.append('../../python/')
import xgboost as xgb
# label need to be 0 to num_class -1
data = np.loadtxt('./dermatology.data', delimiter=',',converters={33: lambda x:int(x == '?'), 34: lambda x:int(x)-1 } )
sz = data.shape
train = data[:int(sz[0] * 0.7), :]
test = data[int(sz[0] * 0.7):, :]
train_X = train[:,0:33]
train_Y = train[:, 34]
test_X = test[:,0:33]
test_Y = test[:, 34]
xg_train = xgb.DMatrix( train_X, label=train_Y)
xg_test = xgb.DMatrix(test_X, label=test_Y)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 6
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
pred = bst.predict( xg_test );
print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))
# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction, this is in 1D array, need reshape to (nclass, ndata)
yprob = bst.predict( xg_test ).reshape( 6, test_Y.shape[0] )
ylabel = np.argmax( yprob, axis=0)
print ('predicting, classification error=%f' % (sum( int(ylabel[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))

13
demo/rank/README Normal file
View File

@@ -0,0 +1,13 @@
Instructions:
The dataset for ranking demo is from LETOR04 MQ2008 fold1,
You can use the following command to run the example
Get the data: ./wgetdata.sh
Run the example: ./runexp.sh

30
demo/rank/mq2008.conf Normal file
View File

@@ -0,0 +1,30 @@
# General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear
booster_type = 0
# specify objective
objective="rank:pairwise"
# Tree Booster Parameters
# step size shrinkage
bst:eta = 0.1
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 0.1
# maximum depth of a tree
bst:max_depth = 6
# Task parameters
# the number of round to do boosting
num_round = 4
# 0 means do not save any model except the final round model
save_period = 0
# The path of training data
data = "mq2008.train"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "mq2008.vali"
# The path of test data
test:data = "mq2008.test"

11
demo/rank/runexp.sh Executable file
View File

@@ -0,0 +1,11 @@
python trans_data.py train.txt mq2008.train mq2008.train.group
python trans_data.py test.txt mq2008.test mq2008.test.group
python trans_data.py vali.txt mq2008.vali mq2008.vali.group
../../xgboost mq2008.conf
../../xgboost mq2008.conf task=pred model_in=0004.model

41
demo/rank/trans_data.py Normal file
View File

@@ -0,0 +1,41 @@
import sys
def save_data(group_data,output_feature,output_group):
if len(group_data) == 0:
return
output_group.write(str(len(group_data))+"\n")
for data in group_data:
# only include nonzero features
feats = [ p for p in data[2:] if float(p.split(':')[1]) != 0.0 ]
output_feature.write(data[0] + " " + " ".join(feats) + "\n")
if __name__ == "__main__":
if len(sys.argv) != 4:
print ("Usage: python trans_data.py [Ranksvm Format Input] [Output Feature File] [Output Group File]")
sys.exit(0)
fi = open(sys.argv[1])
output_feature = open(sys.argv[2],"w")
output_group = open(sys.argv[3],"w")
group_data = []
group = ""
for line in fi:
if not line:
break
if "#" in line:
line = line[:line.index("#")]
splits = line.strip().split(" ")
if splits[1] != group:
save_data(group_data,output_feature,output_group)
group_data = []
group = splits[1]
group_data.append(splits)
save_data(group_data,output_feature,output_group)
fi.close()
output_feature.close()
output_group.close()

4
demo/rank/wgetdata.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/bash
wget http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2008.rar
unrar x MQ2008.rar
mv -f MQ2008/Fold1/*.txt .

View File

@@ -1,9 +1,9 @@
# General Parameters, see comment for each definition # General Parameters, see comment for each definition
# choose the tree booster, 0: tree, 1: linear # choose the tree booster, 0: tree, 1: linear
booster_type = 0 booster_type = 0
# this is the only difference with classification, use 0: linear regression # this is the only difference with classification, use reg:linear to do linear classification
# when labels are in [0,1] we can also use 1: logistic regression # when labels are in [0,1] we can also use reg:logistic
loss_type = 0 objective = reg:linear
# Tree Booster Parameters # Tree Booster Parameters
# step size shrinkage # step size shrinkage

View File

@@ -7,7 +7,7 @@ fmap = {}
for l in open( 'machine.data' ): for l in open( 'machine.data' ):
arr = l.split(',') arr = l.split(',')
fo.write(arr[8]) fo.write(arr[8])
for i in xrange( 0,6 ): for i in range( 0,6 ):
fo.write( ' %d:%s' %(i,arr[i+2]) ) fo.write( ' %d:%s' %(i,arr[i+2]) )
if arr[0] not in fmap: if arr[0] not in fmap:
@@ -24,9 +24,9 @@ fo = open('featmap.txt', 'w')
# list from machine.names # list from machine.names
names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ]; names = ['vendor','MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' ];
for i in xrange(0,6): for i in range(0,6):
fo.write( '%d\t%s\tint\n' % (i, names[i+1])) fo.write( '%d\t%s\tint\n' % (i, names[i+1]))
for v, k in sorted( fmap.iteritems(), key = lambda x:x[1] ): for v, k in sorted( fmap.items(), key = lambda x:x[1] ):
fo.write( '%d\tvendor=%s\ti\n' % (k, v)) fo.write( '%d\tvendor=%s\ti\n' % (k, v))
fo.close() fo.close()

View File

@@ -3,7 +3,7 @@ import sys
import random import random
if len(sys.argv) < 2: if len(sys.argv) < 2:
print 'Usage:<filename> <k> [nfold = 5]' print ('Usage:<filename> <k> [nfold = 5]')
exit(0) exit(0)
random.seed( 10 ) random.seed( 10 )

26
python/Makefile Normal file
View File

@@ -0,0 +1,26 @@
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
# specify tensor path
SLIB = libxgboostpy.so
.PHONY: clean all
all: $(SLIB)
export LDFLAGS= -pthread -lm
libxgboostpy.so: xgboost_python.cpp ../regrank/*.h ../booster/*.h ../booster/*/*.hpp ../booster/*.hpp
$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
install:
cp -f -r $(BIN) $(INSTALL_PATH)
clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *~

3
python/README.md Normal file
View File

@@ -0,0 +1,3 @@
python wrapper for xgboost using ctypes
see example for usage

3
python/example/README.md Normal file
View File

@@ -0,0 +1,3 @@
example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
for usage: see demo.py and comments in demo.py

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

96
python/example/demo.py Executable file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/python
import sys
import numpy as np
import scipy.sparse
# append the path to xgboost, you may need to change the following line
sys.path.append('../')
import xgboost as xgb
### simple example
# load file from text file, also binary buffer generated by xgboost
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map, definition are same as c++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
# specify validations set to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train( param, dtrain, num_round, evallist )
# this is prediction
preds = bst.predict( dtest )
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
bst.save_model('0001.model')
# dump model
bst.dump_model('dump.raw.txt')
# dump model with feature map
bst.dump_model('dump.raw.txt','featmap.txt')
###
# build dmatrix in python iteratively
#
print ('start running example of build DMatrix in python')
dtrain = xgb.DMatrix()
labels = []
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
feats = []
for it in arr[1:]:
k,v = it.split(':')
feats.append( (int(k), float(v)) )
dtrain.add_row( feats )
dtrain.set_label( labels )
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# build dmatrix from scipy.sparse
print ('start running example of build DMatrix from scipy.sparse')
labels = []
row = []; col = []; dat = []
i = 0
for l in open('agaricus.txt.train'):
arr = l.split()
labels.append( int(arr[0]))
for it in arr[1:]:
k,v = it.split(':')
row.append(i); col.append(int(k)); dat.append(float(v))
i += 1
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
dtrain = xgb.DMatrix( csr )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
print ('start running example of build DMatrix from numpy array')
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
npymat = csr.todense()
dtrain = xgb.DMatrix( npymat )
dtrain.set_label(labels)
evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
#
print ('start running example to used cutomized objective function')
# note: set objective= binary:logistic means the prediction will get logistic transformed
# in most case, we may want to leave it as default
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
# user define objective function, given prediction, return gradient and second order gradient
def logregobj( preds, dtrain ):
labels = dtrain.get_label()
grad = preds - labels
hess = preds * (1.0-preds)
return grad, hess
# training with customized objective, we can also do step by step training, simply look at xgboost.py's implementation of train
bst = xgb.train( param, dtrain, num_round, evallist, logregobj )

126
python/example/featmap.txt Normal file
View File

@@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

205
python/xgboost.py Normal file
View File

@@ -0,0 +1,205 @@
# Author: Tianqi Chen, Bing Xu
# module for xgboost
import ctypes
import os
# optinally have scipy sparse, though not necessary
import numpy
import numpy.ctypeslib
import scipy.sparse as scp
# set this line correctly
XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostpy.so'
# entry type of sparse matrix
class REntry(ctypes.Structure):
    """One entry of a sparse row: a feature index paired with its value."""
    _fields_ = [
        ("findex", ctypes.c_uint),
        ("fvalue", ctypes.c_float),
    ]
# load in xgboost library
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
xglib.XGDMatrixCreate.restype = ctypes.c_void_p
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
xglib.XGDMatrixGetLabel.restype = ctypes.POINTER( ctypes.c_float )
xglib.XGDMatrixGetWeight.restype = ctypes.POINTER( ctypes.c_float )
xglib.XGDMatrixGetRow.restype = ctypes.POINTER( REntry )
xglib.XGBoosterCreate.restype = ctypes.c_void_p
xglib.XGBoosterPredict.restype = ctypes.POINTER( ctypes.c_float )
def ctypes2numpy(cptr, length):
    """Copy a ctypes float pointer of `length` elements into a numpy array.

    cptr: ctypes.POINTER(ctypes.c_float) returned by the xgboost C library
    length: number of float entries to copy
    returns: numpy.ndarray of dtype float32 owning a copy of the data
    """
    assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
    res = numpy.zeros(length, dtype='float32')
    # The copy must NOT be performed inside an `assert`: running under
    # `python -O` removes assert statements entirely, which would silently
    # skip the memmove and return an all-zero array.
    if length > 0:
        ctypes.memmove(res.ctypes.data, cptr, length * res.itemsize)
    return res
# data matrix used in xgboost
class DMatrix:
    """Data matrix used in xgboost; wraps the native XGDMatrix handle."""

    def __init__(self, data=None, label=None, missing=0.0, weight=None):
        """Construct a DMatrix.

        data: file name (str), scipy.sparse.csr_matrix, 2-d numpy.ndarray,
              or anything scipy can convert to csr; None creates an empty matrix
        label: optional array-like of labels
        missing: value treated as missing when parsing a dense numpy matrix
        weight: optional array-like of per-instance weights
        """
        # force into void_p, mac need to pass things in as void_p
        self.handle = ctypes.c_void_p(xglib.XGDMatrixCreate())
        # use `is None` rather than `== None`: equality on a numpy array is
        # element-wise and its truth value raises ValueError
        if data is None:
            return
        if isinstance(data, str):
            xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(data.encode('utf-8')), 1)
        elif isinstance(data, scp.csr_matrix):
            self.__init_from_csr(data)
        elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
            self.__init_from_npy2d(data, missing)
        else:
            try:
                csr = scp.csr_matrix(data)
                self.__init_from_csr(csr)
            except Exception:
                # bare `except:` would also swallow SystemExit/KeyboardInterrupt
                raise Exception("can not initialize DMatrix from " + str(type(data)))
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)

    def __init_from_csr(self, csr):
        """Initialize matrix content from a scipy csr_matrix."""
        assert len(csr.indices) == len(csr.data)
        xglib.XGDMatrixParseCSR(self.handle,
                                (ctypes.c_ulong * len(csr.indptr))(*csr.indptr),
                                (ctypes.c_uint * len(csr.indices))(*csr.indices),
                                (ctypes.c_float * len(csr.data))(*csr.data),
                                len(csr.indptr), len(csr.data))

    def __init_from_npy2d(self, mat, missing):
        """Initialize matrix content from a dense 2-d numpy array."""
        # flatten to the row-major float32 buffer expected by the C side
        data = numpy.array(mat.reshape(mat.size), dtype='float32')
        xglib.XGDMatrixParseMat(self.handle,
                                data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                                mat.shape[0], mat.shape[1], ctypes.c_float(missing))

    def __del__(self):
        """Release the native handle."""
        xglib.XGDMatrixFree(self.handle)

    def load(self, fname, silent=True):
        """Load data from a text file (or its cached binary buffer if it exists)."""
        xglib.XGDMatrixLoad(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    def save_binary(self, fname, silent=True):
        """Save the matrix into a binary file for fast reloading."""
        xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))

    def set_label(self, label):
        """Set label of the dmatrix."""
        xglib.XGDMatrixSetLabel(self.handle, (ctypes.c_float * len(label))(*label), len(label))

    def set_group(self, group):
        """Set group sizes of the dmatrix, used for ranking."""
        xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint * len(group))(*group), len(group))

    def set_weight(self, weight):
        """Set weight of each instance."""
        xglib.XGDMatrixSetWeight(self.handle, (ctypes.c_float * len(weight))(*weight), len(weight))

    def get_label(self):
        """Return labels as a numpy float32 array."""
        length = ctypes.c_ulong()
        labels = xglib.XGDMatrixGetLabel(self.handle, ctypes.byref(length))
        return ctypes2numpy(labels, length.value)

    def get_weight(self):
        """Return instance weights as a numpy float32 array."""
        length = ctypes.c_ulong()
        weights = xglib.XGDMatrixGetWeight(self.handle, ctypes.byref(length))
        return ctypes2numpy(weights, length.value)

    def clear(self):
        """Clear everything: feature matrix, labels, weights and groups."""
        xglib.XGDMatrixClear(self.handle)

    def num_row(self):
        """Return number of rows."""
        return xglib.XGDMatrixNumRow(self.handle)

    def add_row(self, row):
        """Append a row, given as a sequence of (findex, fvalue) pairs."""
        xglib.XGDMatrixAddRow(self.handle, (REntry * len(row))(*row), len(row))

    def __getitem__(self, ridx):
        """Return the ridx-th row as a list of (findex, fvalue) tuples."""
        length = ctypes.c_ulong()
        row = xglib.XGDMatrixGetRow(self.handle, ridx, ctypes.byref(length))
        return [(int(row[i].findex), row[i].fvalue) for i in range(length.value)]
class Booster:
    """Learner class wrapping the native XGBooster handle."""

    def __init__(self, params={}, cache=[]):
        """Constructor.

        params: dict / "name" string (with pv) / iterable of (name, value) pairs
        cache: list of DMatrix whose prediction results should be cached
        """
        # the default params/cache are never mutated here, so sharing is safe
        for d in cache:
            assert isinstance(d, DMatrix)
        dmats = (ctypes.c_void_p * len(cache))(*[d.handle for d in cache])
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        # set the seed first so a user-supplied 'seed' in params overrides it
        self.set_param({'seed': 0})
        self.set_param(params)

    def __del__(self):
        """Release the native handle."""
        xglib.XGBoosterFree(self.handle)

    def set_param(self, params, pv=None):
        """Set parameter(s): accepts a dict, a (params, pv) name/value pair,
        or an iterable of (name, value) pairs."""
        if isinstance(params, dict):
            for k, v in params.items():
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
                    ctypes.c_char_p(str(v).encode('utf-8')))
        elif isinstance(params, str) and pv is not None:
            xglib.XGBoosterSetParam(
                self.handle, ctypes.c_char_p(params.encode('utf-8')),
                ctypes.c_char_p(str(pv).encode('utf-8')))
        else:
            for k, v in params:
                xglib.XGBoosterSetParam(
                    self.handle, ctypes.c_char_p(k.encode('utf-8')),
                    ctypes.c_char_p(str(v).encode('utf-8')))

    def update(self, dtrain):
        """Update the model one iteration using the built-in objective."""
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterUpdateOneIter(self.handle, dtrain.handle)

    def boost(self, dtrain, grad, hess, bst_group=-1):
        """Boost one iteration with user-supplied gradient/hessian statistics."""
        assert len(grad) == len(hess)
        assert isinstance(dtrain, DMatrix)
        xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                    (ctypes.c_float * len(grad))(*grad),
                                    (ctypes.c_float * len(hess))(*hess),
                                    len(grad), bst_group)

    def update_interact(self, dtrain, action, booster_index=None):
        """beta: update with specified action"""
        assert isinstance(dtrain, DMatrix)
        if booster_index is not None:
            self.set_param('interact:booster_index', str(booster_index))
        # encode to bytes: c_char_p needs bytes on Python 3, matching every
        # other string passed to xglib in this module
        xglib.XGBoosterUpdateInteract(
            self.handle, dtrain.handle, ctypes.c_char_p(str(action).encode('utf-8')))

    def eval_set(self, evals, it=0):
        """Evaluate a list of (DMatrix, name) pairs at iteration `it`."""
        for d in evals:
            assert isinstance(d[0], DMatrix)
            assert isinstance(d[1], str)
        dmats = (ctypes.c_void_p * len(evals))(*[d[0].handle for d in evals])
        evnames = (ctypes.c_char_p * len(evals))(
            *[ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
        xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))

    def eval(self, mat, name='eval', it=0):
        """Evaluate a single matrix."""
        self.eval_set([(mat, name)], it)

    def predict(self, data, bst_group=-1):
        """Run prediction on `data`; returns a numpy float32 array."""
        length = ctypes.c_ulong()
        preds = xglib.XGBoosterPredict(self.handle, data.handle, ctypes.byref(length), bst_group)
        return ctypes2numpy(preds, length.value)

    def save_model(self, fname):
        """Save model to file."""
        xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def load_model(self, fname):
        """Load model from file."""
        xglib.XGBoosterLoadModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))

    def dump_model(self, fname, fmap=''):
        """Dump model into a text file; fmap optionally names a feature-map file."""
        xglib.XGBoosterDumpModel(
            self.handle, ctypes.c_char_p(fname.encode('utf-8')),
            ctypes.c_char_p(fmap.encode('utf-8')))
def train(params, dtrain, num_boost_round=10, evals=[], obj=None):
    """Train a booster with given parameters.

    params: booster parameters (dict / list of pairs)
    dtrain: training DMatrix
    num_boost_round: number of boosting iterations
    evals: list of (DMatrix, name) pairs evaluated after each round
           (default is read-only, never mutated here)
    obj: optional customized objective: obj(pred, dtrain) -> (grad, hess)
    returns: the trained Booster
    """
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    # use `is None` for identity comparison with None
    if obj is None:
        for i in range(num_boost_round):
            bst.update(dtrain)
            if len(evals) != 0:
                bst.eval_set(evals, i)
    else:
        # customized objective: compute gradients in python, boost directly
        for i in range(num_boost_round):
            pred = bst.predict(dtrain)
            grad, hess = obj(pred, dtrain)
            bst.boost(dtrain, grad, hess)
            if len(evals) != 0:
                bst.eval_set(evals, i)
    return bst

297
python/xgboost_python.cpp Normal file
View File

@@ -0,0 +1,297 @@
// implementations in ctypes
#include "xgboost_python.h"
#include "../regrank/xgboost_regrank.h"
#include "../regrank/xgboost_regrank_data.h"
namespace xgboost{
namespace python{
// Python-facing data matrix: extends regrank::DMatrix with in-place
// construction helpers (CSR/dense parsing, row append) used by the
// ctypes wrapper in xgboost.py.
class DMatrix: public regrank::DMatrix{
public:
    // whether column is initialized
    bool init_col_;
public:
    DMatrix(void){
        init_col_ = false;
    }
    ~DMatrix(void){}
public:
    // load from text file (or cached binary buffer, if one exists);
    // remembers whether the loaded data already has column access
    inline void Load(const char *fname, bool silent){
        this->CacheLoad(fname, silent);
        init_col_ = this->data.HaveColAccess();
    }
    // clear all content: features, labels, weights and group pointers
    inline void Clear( void ){
        this->data.Clear();
        this->info.labels.clear();
        this->info.weights.clear();
        this->info.group_ptr.clear();
    }
    // number of rows currently stored
    inline size_t NumRow( void ) const{
        return this->data.NumRow();
    }
    // append one sparse row of `len` entries; invalidates column access
    inline void AddRow( const XGEntry *data, size_t len ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.row_data_.resize( mat.row_ptr_.back() + len );
        memcpy( &mat.row_data_[mat.row_ptr_.back()], data, sizeof(XGEntry)*len );
        mat.row_ptr_.push_back( mat.row_ptr_.back() + len );
        init_col_ = false;
    }
    // return a pointer to the ridx-th row, storing its length in *len
    inline const XGEntry* GetRow(unsigned ridx, size_t* len) const{
        const xgboost::booster::FMatrixS &mat = this->data;
        *len = mat.row_ptr_[ridx+1] - mat.row_ptr_[ridx];
        return &mat.row_data_[ mat.row_ptr_[ridx] ];
    }
    // fill content from CSR arrays: indptr has nindptr entries,
    // indices/data have nelem entries each
    inline void ParseCSR( const size_t *indptr,
                          const unsigned *indices,
                          const float *data,
                          size_t nindptr,
                          size_t nelem ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.row_ptr_.resize( nindptr );
        memcpy( &mat.row_ptr_[0], indptr, sizeof(size_t)*nindptr );
        mat.row_data_.resize( nelem );
        for( size_t i = 0; i < nelem; ++ i ){
            mat.row_data_[i] = XGEntry(indices[i], data[i]);
        }
        this->data.InitData();
        this->init_col_ = true;
    }
    // fill content from a dense row-major nrow x ncol buffer;
    // entries equal to `missing` are skipped
    inline void ParseMat( const float *data,
                          size_t nrow,
                          size_t ncol,
                          float missing ){
        xgboost::booster::FMatrixS &mat = this->data;
        mat.Clear();
        for( size_t i = 0; i < nrow; ++i, data += ncol ){
            size_t nelem = 0;
            for( size_t j = 0; j < ncol; ++j ){
                if( data[j] != missing ){
                    mat.row_data_.push_back( XGEntry(j, data[j]) );
                    ++ nelem;
                }
            }
            mat.row_ptr_.push_back( mat.row_ptr_.back() + nelem );
        }
        this->data.InitData();
        this->init_col_ = true;
    }
    // copy `len` labels into the info structure
    inline void SetLabel( const float *label, size_t len ){
        this->info.labels.resize( len );
        memcpy( &(this->info).labels[0], label, sizeof(float)*len );
    }
    // convert per-group sizes into cumulative group_ptr offsets
    inline void SetGroup( const unsigned *group, size_t len ){
        this->info.group_ptr.resize( len + 1 );
        this->info.group_ptr[0] = 0;
        for( size_t i = 0; i < len; ++ i ){
            this->info.group_ptr[i+1] = this->info.group_ptr[i]+group[i];
        }
    }
    // copy `len` per-instance weights into the info structure
    inline void SetWeight( const float *weight, size_t len ){
        this->info.weights.resize( len );
        memcpy( &(this->info).weights[0], weight, sizeof(float)*len );
    }
    // expose the label vector; *len receives its size
    inline const float* GetLabel( size_t* len ) const{
        *len = this->info.labels.size();
        return &(this->info.labels[0]);
    }
    // expose the weight vector; *len receives its size
    inline const float* GetWeight( size_t* len ) const{
        *len = this->info.weights.size();
        return &(this->info.weights[0]);
    }
    // lazily (re)build column access and check labels match row count
    inline void CheckInit(void){
        if(!init_col_){
            this->data.InitData();
            init_col_ = true;
        }
        utils::Assert( this->data.NumRow() == this->info.labels.size(), "DMatrix: number of labels must match number of rows in matrix");
    }
};
// Python-facing learner: defers InitTrainer/InitModel so parameters can
// still be set after construction; CheckInit runs them exactly once.
class Booster: public xgboost::regrank::RegRankBoostLearner{
private:
    // whether InitTrainer / InitModel have been performed yet
    bool init_trainer, init_model;
public:
    Booster(const std::vector<regrank::DMatrix *> mats){
        silent = 1;
        init_trainer = false;
        init_model = false;
        this->SetCacheData(mats);
    }
    // lazily initialize trainer and model state (idempotent)
    inline void CheckInit(void){
        if( !init_trainer ){
            this->InitTrainer(); init_trainer = true;
        }
        if( !init_model ){
            this->InitModel(); init_model = true;
        }
    }
    // loading a model from file counts as model initialization
    inline void LoadModel( const char *fname ){
        xgboost::regrank::RegRankBoostLearner::LoadModel(fname);
        this->init_model = true;
    }
    // intercept "seed" to reseed the global RNG, then forward to the base
    inline void SetParam( const char *name, const char *val ){
        if( !strcmp( name, "seed" ) ) random::Seed(atoi(val));
        xgboost::regrank::RegRankBoostLearner::SetParam( name, val );
    }
    // predict into the internal buffer and expose it; the returned pointer
    // is only valid until the next call that modifies preds_
    const float *Pred( const DMatrix &dmat, size_t *len, int bst_group ){
        this->CheckInit();
        this->Predict( this->preds_, dmat, bst_group );
        *len = this->preds_.size();
        return &this->preds_[0];
    }
    // boost one iteration from externally supplied gradient/hessian;
    // len == nrow boosts a single group, len == nrow*ngroup boosts every
    // group in turn (bst_group must then be -1)
    inline void BoostOneIter( const DMatrix &train,
                              float *grad, float *hess, size_t len, int bst_group ){
        this->grad_.resize( len ); this->hess_.resize( len );
        memcpy( &this->grad_[0], grad, sizeof(float)*len );
        memcpy( &this->hess_[0], hess, sizeof(float)*len );
        if( grad_.size() == train.Size() ){
            if( bst_group < 0 ) bst_group = 0;
            base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index, bst_group);
        }else{
            utils::Assert( bst_group == -1, "must set bst_group to -1 to support all group boosting" );
            int ngroup = base_gbm.NumBoosterGroup();
            utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
            // slice the flat grad/hess buffers per group and boost each group
            std::vector<float> tgrad( train.Size() ), thess( train.Size() );
            for( int g = 0; g < ngroup; ++ g ){
                memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
                memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
                base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
            }
        }
    }
};
};
};
using namespace xgboost::python;
// C-linkage entry points exported to ctypes; each is a thin cast from the
// opaque void* handle to the concrete DMatrix/Booster type.
extern "C"{
    // ---------------- DMatrix C API ----------------
    void* XGDMatrixCreate( void ){
        return new DMatrix();
    }
    void XGDMatrixFree( void *handle ){
        delete static_cast<DMatrix*>(handle);
    }
    void XGDMatrixLoad( void *handle, const char *fname, int silent ){
        static_cast<DMatrix*>(handle)->Load(fname, silent!=0);
    }
    void XGDMatrixSaveBinary( void *handle, const char *fname, int silent ){
        static_cast<DMatrix*>(handle)->SaveBinary(fname, silent!=0);
    }
    void XGDMatrixParseCSR( void *handle,
                            const size_t *indptr,
                            const unsigned *indices,
                            const float *data,
                            size_t nindptr,
                            size_t nelem ){
        static_cast<DMatrix*>(handle)->ParseCSR(indptr, indices, data, nindptr, nelem);
    }
    void XGDMatrixParseMat( void *handle,
                            const float *data,
                            size_t nrow,
                            size_t ncol,
                            float missing ){
        static_cast<DMatrix*>(handle)->ParseMat(data, nrow, ncol, missing);
    }
    void XGDMatrixSetLabel( void *handle, const float *label, size_t len ){
        static_cast<DMatrix*>(handle)->SetLabel(label,len);
    }
    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len ){
        static_cast<DMatrix*>(handle)->SetWeight(weight,len);
    }
    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len ){
        static_cast<DMatrix*>(handle)->SetGroup(group,len);
    }
    const float* XGDMatrixGetLabel( const void *handle, size_t* len ){
        return static_cast<const DMatrix*>(handle)->GetLabel(len);
    }
    const float* XGDMatrixGetWeight( const void *handle, size_t* len ){
        return static_cast<const DMatrix*>(handle)->GetWeight(len);
    }
    void XGDMatrixClear(void *handle){
        static_cast<DMatrix*>(handle)->Clear();
    }
    void XGDMatrixAddRow( void *handle, const XGEntry *data, size_t len ){
        static_cast<DMatrix*>(handle)->AddRow(data, len);
    }
    size_t XGDMatrixNumRow(const void *handle){
        return static_cast<const DMatrix*>(handle)->NumRow();
    }
    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len){
        return static_cast<DMatrix*>(handle)->GetRow(ridx, len);
    }
    // ---------------- Booster C API ----------------
    // the matrices in dmats are registered as the booster's prediction cache
    void *XGBoosterCreate( void *dmats[], size_t len ){
        std::vector<xgboost::regrank::DMatrix*> mats;
        for( size_t i = 0; i < len; ++i ){
            DMatrix *dtr = static_cast<DMatrix*>(dmats[i]);
            dtr->CheckInit();
            mats.push_back( dtr );
        }
        return new Booster( mats );
    }
    void XGBoosterFree( void *handle ){
        delete static_cast<Booster*>(handle);
    }
    void XGBoosterSetParam( void *handle, const char *name, const char *value ){
        static_cast<Booster*>(handle)->SetParam( name, value );
    }
    void XGBoosterUpdateOneIter( void *handle, void *dtrain ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        bst->UpdateOneIter( *dtr );
    }
    // boost with externally supplied gradient/hessian (custom objectives)
    void XGBoosterBoostOneIter( void *handle, void *dtrain,
                                float *grad, float *hess, size_t len, int bst_group ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        bst->BoostOneIter( *dtr, grad, hess, len, bst_group );
    }
    // note: evaluation output is written to stderr
    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len ){
        Booster *bst = static_cast<Booster*>(handle);
        bst->CheckInit();
        std::vector<std::string> names;
        std::vector<const xgboost::regrank::DMatrix*> mats;
        for( size_t i = 0; i < len; ++i ){
            mats.push_back( static_cast<DMatrix*>(dmats[i]) );
            names.push_back( std::string( evnames[i]) );
        }
        bst->EvalOneIter( iter, mats, names, stderr );
    }
    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group ){
        return static_cast<Booster*>(handle)->Pred( *static_cast<DMatrix*>(dmat), len, bst_group );
    }
    void XGBoosterLoadModel( void *handle, const char *fname ){
        static_cast<Booster*>(handle)->LoadModel( fname );
    }
    void XGBoosterSaveModel( const void *handle, const char *fname ){
        static_cast<const Booster*>(handle)->SaveModel( fname );
    }
    // dump model as text; fmap may be "" or the path to a feature-map file
    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap ){
        using namespace xgboost::utils;
        FILE *fo = FopenCheck( fname, "w" );
        FeatMap featmap;
        if( strlen(fmap) != 0 ){
            featmap.LoadText( fmap );
        }
        static_cast<Booster*>(handle)->DumpModel( fo, featmap, false );
        fclose( fo );
    }
    // beta: interactive update with a named action (e.g. "remove")
    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char *action ){
        Booster *bst = static_cast<Booster*>(handle);
        DMatrix *dtr = static_cast<DMatrix*>(dtrain);
        bst->CheckInit(); dtr->CheckInit();
        std::string act( action );
        bst->UpdateInteract( act, *dtr );
    }
};

209
python/xgboost_python.h Normal file
View File

@@ -0,0 +1,209 @@
#ifndef XGBOOST_PYTHON_H
#define XGBOOST_PYTHON_H
/*!
 * \file xgboost_python.h
 * \author Tianqi Chen
 * \brief python wrapper for xgboost, using ctypes,
 *        hides everything behind functions,
 *        uses a C style interface
 */
#include "../booster/xgboost_data.h"
extern "C"{
    /*! \brief type of row entry */
    typedef xgboost::booster::FMatrixS::REntry XGEntry;
    /*!
     * \brief create a data matrix
     * \return a new data matrix
     */
    void* XGDMatrixCreate(void);
    /*!
     * \brief free space in data matrix
     */
    void XGDMatrixFree(void *handle);
    /*!
     * \brief load a data matrix from text file or buffer(if exists)
     * \param handle an instance of data matrix
     * \param fname file name
     * \param silent whether to suppress statistics when loading
     */
    void XGDMatrixLoad(void *handle, const char *fname, int silent);
    /*!
     * \brief save a data matrix into a binary file
     * \param handle an instance of data matrix
     * \param fname file name
     * \param silent whether to suppress statistics when saving
     */
    void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
    /*!
     * \brief set matrix content from csr format
     * \param handle an instance of data matrix
     * \param indptr pointer to row headers
     * \param indices findex
     * \param data fvalue
     * \param nindptr number of rows in the matrix + 1
     * \param nelem number of nonzero elements in the matrix
     */
    void XGDMatrixParseCSR( void *handle,
                            const size_t *indptr,
                            const unsigned *indices,
                            const float *data,
                            size_t nindptr,
                            size_t nelem );
    /*!
     * \brief set matrix content from dense data content
     * \param handle an instance of data matrix
     * \param data pointer to the data space
     * \param nrow number of rows
     * \param ncol number of columns
     * \param missing which value to represent missing value
     */
    void XGDMatrixParseMat( void *handle,
                            const float *data,
                            size_t nrow,
                            size_t ncol,
                            float missing );
    /*!
     * \brief set label of the training matrix
     * \param handle an instance of data matrix
     * \param label pointer to label
     * \param len length of array
     */
    void XGDMatrixSetLabel( void *handle, const float *label, size_t len );
    /*!
     * \brief set group sizes of the training matrix (used for ranking)
     * \param handle an instance of data matrix
     * \param group pointer to group size
     * \param len length of array
     */
    void XGDMatrixSetGroup( void *handle, const unsigned *group, size_t len );
    /*!
     * \brief set weight of each instance
     * \param handle an instance of data matrix
     * \param weight data pointer to weights
     * \param len length of array
     */
    void XGDMatrixSetWeight( void *handle, const float *weight, size_t len );
    /*!
     * \brief get label set from matrix
     * \param handle an instance of data matrix
     * \param len used to set result length
     * \return pointer to the label
     */
    const float* XGDMatrixGetLabel( const void *handle, size_t* len );
    /*!
     * \brief get weight set from matrix
     * \param handle an instance of data matrix
     * \param len used to set result length
     * \return pointer to the weight
     */
    const float* XGDMatrixGetWeight( const void *handle, size_t* len );
    /*!
     * \brief clear all the records, including feature matrix and label
     * \param handle an instance of data matrix
     */
    void XGDMatrixClear(void *handle);
    /*!
     * \brief return number of rows
     */
    size_t XGDMatrixNumRow(const void *handle);
    /*!
     * \brief add a row
     * \param handle an instance of data matrix
     * \param data array of row content
     * \param len length of array
     */
    void XGDMatrixAddRow(void *handle, const XGEntry *data, size_t len);
    /*!
     * \brief get ridx-th row of sparse matrix
     * \param handle handle
     * \param ridx row index
     * \param len used to set result length
     * \return pointer to the row
     */
    const XGEntry* XGDMatrixGetRow(void *handle, unsigned ridx, size_t* len);
    // --- start XGBoost class
    /*!
     * \brief create xgboost learner
     * \param dmats matrices that are set to be cached
     * \param len length of dmats
     * \return a new booster
     */
    void *XGBoosterCreate( void* dmats[], size_t len );
    /*!
     * \brief free obj in handle
     * \param handle handle to be freed
     */
    void XGBoosterFree( void* handle );
    /*!
     * \brief set parameters
     * \param handle handle
     * \param name parameter name
     * \param value value of parameter
     */
    void XGBoosterSetParam( void *handle, const char *name, const char *value );
    /*!
     * \brief update the model in one round using dtrain
     * \param handle handle
     * \param dtrain training data
     */
    void XGBoosterUpdateOneIter( void *handle, void *dtrain );
    /*!
     * \brief update the model, by directly specifying gradient and second order gradient,
     *        this can be used to replace UpdateOneIter, to support customized loss functions
     * \param handle handle
     * \param dtrain training data
     * \param grad gradient statistics
     * \param hess second order gradient statistics
     * \param len length of grad/hess array
     * \param bst_group boost group we are working at, default = -1
     */
    void XGBoosterBoostOneIter( void *handle, void *dtrain,
                                float *grad, float *hess, size_t len, int bst_group );
    /*!
     * \brief print evaluation statistics to stderr for xgboost
     * \param handle handle
     * \param iter current iteration rounds
     * \param dmats pointers to data to be evaluated
     * \param evnames pointers to names of each data
     * \param len length of dmats
     */
    void XGBoosterEvalOneIter( void *handle, int iter, void *dmats[], const char *evnames[], size_t len );
    /*!
     * \brief make prediction based on dmat
     * \param handle handle
     * \param dmat data matrix
     * \param len used to store length of returning result
     * \param bst_group booster group, if model contains multiple booster groups, default = -1 means predict for all groups
     */
    const float *XGBoosterPredict( void *handle, void *dmat, size_t *len, int bst_group );
    /*!
     * \brief load model from existing file
     * \param handle handle
     * \param fname file name
     */
    void XGBoosterLoadModel( void *handle, const char *fname );
    /*!
     * \brief save model into existing file
     * \param handle handle
     * \param fname file name
     */
    void XGBoosterSaveModel( const void *handle, const char *fname );
    /*!
     * \brief dump model into text file
     * \param handle handle
     * \param fname file name
     * \param fmap name of feature map file, can be empty string
     */
    void XGBoosterDumpModel( void *handle, const char *fname, const char *fmap );
    /*!
     * \brief interactively update model: beta
     * \param handle handle
     * \param dtrain training data
     * \param action action name
     */
    void XGBoosterUpdateInteract( void *handle, void *dtrain, const char* action );
};
#endif

401
regrank/xgboost_regrank.h Normal file
View File

@@ -0,0 +1,401 @@
#ifndef XGBOOST_REGRANK_H
#define XGBOOST_REGRANK_H
/*!
* \file xgboost_regrank.h
* \brief class for gradient boosted regression and ranking
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "xgboost_regrank_data.h"
#include "xgboost_regrank_eval.h"
#include "xgboost_regrank_obj.h"
#include "../utils/xgboost_omp.h"
#include "../booster/xgboost_gbmbase.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regrank{
/*! \brief class for gradient boosted regression and ranking */
class RegRankBoostLearner{
public:
/*! \brief constructor */
RegRankBoostLearner(void){
silent = 0;
obj_ = NULL;
name_obj_ = "reg:linear";
}
/*! \brief destructor */
~RegRankBoostLearner(void){
if( obj_ != NULL ) delete obj_;
}
/*!
* \brief a regression booter associated with training and evaluating data
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
RegRankBoostLearner(const std::vector<DMatrix *>& mats){
silent = 0;
obj_ = NULL;
name_obj_ = "reg:linear";
this->SetCacheData(mats);
}
/*!
* \brief add internal cache space for mat, this can speedup prediction for matrix,
* please cache prediction for training and eval data
* warning: if the model is loaded from file from some previous training history
* set cache data must be called with exactly SAME
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix *>& mats){
// estimate feature bound
int num_feature = 0;
// assign buffer index
unsigned buffer_size = 0;
utils::Assert( cache_.size() == 0, "can only call cache data once" );
for( size_t i = 0; i < mats.size(); ++i ){
bool dupilicate = false;
for( size_t j = 0; j < i; ++ j ){
if( mats[i] == mats[j] ) dupilicate = true;
}
if( dupilicate ) continue;
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back( CacheEntry( mats[i], buffer_size, mats[i]->Size() ) );
buffer_size += static_cast<unsigned>(mats[i]->Size());
num_feature = std::max(num_feature, (int)(mats[i]->data.NumCol()));
}
char str_temp[25];
if (num_feature > mparam.num_feature){
mparam.num_feature = num_feature;
sprintf(str_temp, "%d", num_feature);
base_gbm.SetParam("bst:num_feature", str_temp);
}
sprintf(str_temp, "%u", buffer_size);
base_gbm.SetParam("num_pbuffer", str_temp);
if (!silent){
printf("buffer_size=%u\n", buffer_size);
}
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val){
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp(name, "objective") ) name_obj_ = val;
if (!strcmp(name, "num_class") ) base_gbm.SetParam("num_booster_group", val );
mparam.SetParam(name, val);
base_gbm.SetParam(name, val);
cfg_.push_back( std::make_pair( std::string(name), std::string(val) ) );
}
/*!
* \brief initialize solver before training, called before training
* this function is reserved for solver to allocate necessary space and do other preparation
*/
inline void InitTrainer(void){
if( mparam.num_class != 0 ){
if( name_obj_ != "multi:softmax" && name_obj_ != "multi:softprob"){
name_obj_ = "multi:softmax";
printf("auto select objective=softmax to support multi-class classification\n" );
}
}
base_gbm.InitTrainer();
obj_ = CreateObjFunction( name_obj_.c_str() );
for( size_t i = 0; i < cfg_.size(); ++ i ){
obj_->SetParam( cfg_[i].first.c_str(), cfg_[i].second.c_str() );
}
evaluator_.AddEval( obj_->DefaultEvalMetric() );
}
/*!
* \brief initialize the current data storage for model, if the model is used first time, call this function
*/
inline void InitModel(void){
base_gbm.InitModel();
mparam.AdjustBase(name_obj_.c_str());
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname){
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
this->LoadModel(fi);
fi.Close();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel(utils::IStream &fi){
base_gbm.LoadModel(fi);
utils::Assert(fi.Read(&mparam, sizeof(ModelParam)) != 0);
// save name obj
size_t len;
if( fi.Read(&len, sizeof(len)) != 0 ){
name_obj_.resize( len );
if( len != 0 ){
utils::Assert( fi.Read(&name_obj_[0], len*sizeof(char)) != 0 );
}
}
}
/*!
* \brief DumpModel
* \param fo text file
* \param fmap feature map that may help give interpretations of feature
* \param with_stats whether print statistics as well
*/
inline void DumpModel(FILE *fo, const utils::FeatMap& fmap, bool with_stats){
base_gbm.DumpModel(fo, fmap, with_stats);
}
/*!
* \brief Dump path of all trees
* \param fo text file
* \param data input data
*/
inline void DumpPath(FILE *fo, const DMatrix &data){
base_gbm.DumpPath(fo, data.data);
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel(utils::IStream &fo) const{
base_gbm.SaveModel(fo);
fo.Write(&mparam, sizeof(ModelParam));
// save name obj
size_t len = name_obj_.length();
fo.Write(&len, sizeof(len));
fo.Write(&name_obj_[0], len*sizeof(char));
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const{
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
}
/*!
* \brief update the model for one iteration
*/
inline void UpdateOneIter(const DMatrix &train){
this->PredictRaw(preds_, train);
obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
if( grad_.size() == train.Size() ){
base_gbm.DoBoost(grad_, hess_, train.data, train.info.root_index);
}else{
int ngroup = base_gbm.NumBoosterGroup();
utils::Assert( grad_.size() == train.Size() * (size_t)ngroup, "BUG: UpdateOneIter: mclass" );
std::vector<float> tgrad( train.Size() ), thess( train.Size() );
for( int g = 0; g < ngroup; ++ g ){
memcpy( &tgrad[0], &grad_[g*tgrad.size()], sizeof(float)*tgrad.size() );
memcpy( &thess[0], &hess_[g*tgrad.size()], sizeof(float)*tgrad.size() );
base_gbm.DoBoost(tgrad, thess, train.data, train.info.root_index, g );
}
}
}
/*!
* \brief evaluate the model for specific iteration
* \param iter iteration number
* \param evals datas i want to evaluate
* \param evname name of each dataset
* \param fo file to output log
*/
inline void EvalOneIter(int iter,
const std::vector<const DMatrix*> &evals,
const std::vector<std::string> &evname,
FILE *fo=stderr ){
fprintf(fo, "[%d]", iter);
for (size_t i = 0; i < evals.size(); ++i){
this->PredictRaw(preds_, *evals[i]);
obj_->EvalTransform(preds_);
evaluator_.Eval(fo, evname[i].c_str(), preds_, evals[i]->info);
}
fprintf(fo, "\n");
fflush(fo);
}
/*!
* \brief get prediction
* \param storage to store prediction
* \param data input data
* \param bst_group booster group we are in
*/
inline void Predict(std::vector<float> &preds, const DMatrix &data, int bst_group = -1){
this->PredictRaw( preds, data, bst_group );
obj_->PredTransform( preds );
}
public:
/*!
* \brief interactive update
* \param action action type
* \parma train training data
*/
inline void UpdateInteract(std::string action, const DMatrix& train){
for(size_t i = 0; i < cache_.size(); ++i){
this->InteractPredict(preds_, *cache_[i].mat_);
}
if (action == "remove"){
base_gbm.DelteBooster(); return;
}
obj_->GetGradient(preds_, train.info, base_gbm.NumBoosters(), grad_, hess_);
std::vector<unsigned> root_index;
base_gbm.DoBoost(grad_, hess_, train.data, root_index);
for(size_t i = 0; i < cache_.size(); ++i){
this->InteractRePredict(*cache_[i].mat_);
}
}
private:
/*! \brief get the transformed predictions, given data */
inline void InteractPredict(std::vector<float> &preds, const DMatrix &data){
int buffer_offset = this->FindBufferOffset(data);
utils::Assert( buffer_offset >=0, "interact mode must cache training data" );
preds.resize(data.Size());
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = mparam.base_score + base_gbm.InteractPredict(data.data, j, buffer_offset + j);
}
obj_->PredTransform( preds );
}
/*! \brief repredict trial */
inline void InteractRePredict(const DMatrix &data){
int buffer_offset = this->FindBufferOffset(data);
utils::Assert( buffer_offset >=0, "interact mode must cache training data" );
const unsigned ndata = static_cast<unsigned>(data.Size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
base_gbm.InteractRePredict(data.data, j, buffer_offset + j);
}
}
/*! \brief compute untransformed margin predictions for data */
inline void PredictRaw(std::vector<float> &preds, const DMatrix &data, int bst_group = -1){
    const int buffer_offset = this->FindBufferOffset(data);
    if (bst_group >= 0){
        // a single booster group was requested
        preds.resize(data.Size());
        this->PredictBuffer(&preds[0], data, buffer_offset, bst_group);
    } else {
        // predict with every booster group, output laid out group after group
        const int ngroup = base_gbm.NumBoosterGroup();
        preds.resize(data.Size() * ngroup);
        for (int g = 0; g < ngroup; ++g){
            this->PredictBuffer(&preds[data.Size() * g], data, buffer_offset, g);
        }
    }
}
/*!
 * \brief fill preds[0..data.Size()) with untransformed predictions
 * \param preds output array, must have room for data.Size() entries
 * \param data input data matrix
 * \param buffer_offset offset into the internal prediction buffer, or -1 when the matrix is not cached
 * \param bst_group booster group used for prediction
 */
inline void PredictBuffer(float *preds, const DMatrix &data, int buffer_offset, int bst_group ){
    const unsigned ndata = static_cast<unsigned>(data.Size());
    if( buffer_offset >= 0 ){
        // cached matrix: predict through the internal buffer slot of each row
        #pragma omp parallel for schedule( static )
        for (unsigned j = 0; j < ndata; ++j){
            preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, buffer_offset + j, data.info.GetRoot(j), bst_group );
        }
    } else {
        // uncached matrix: full prediction without a buffer slot (-1)
        // fix: the original left this branch unbraced (an OpenMP directive may not be
        // the immediate body of an else) and had a stray "}{ }" empty block after it
        #pragma omp parallel for schedule( static )
        for (unsigned j = 0; j < ndata; ++j){
            preds[j] = mparam.base_score + base_gbm.Predict(data.data, j, -1, data.info.GetRoot(j), bst_group );
        }
    }
}
private:
/*! \brief training parameter for regression */
struct ModelParam{
    /*! \brief global bias added to every prediction */
    float base_score;
    /*! \brief type of loss function */
    int loss_type;
    /*! \brief number of features */
    int num_feature;
    /*! \brief number of class, if it is multi-class classification */
    int num_class;
    /*! \brief reserved field, kept zeroed so the binary model layout can grow */
    int reserved[15];
    /*! \brief constructor: fill in the default parameter values */
    ModelParam(void){
        base_score = 0.5f;
        loss_type = -1;
        num_feature = 0;
        num_class = 0;
        memset(reserved, 0, sizeof(reserved));
    }
    /*!
     * \brief set a parameter by name
     * \param name name of the parameter
     * \param val value of the parameter
     */
    inline void SetParam(const char *name, const char *val){
        if (!strcmp("base_score", name)) base_score = (float)atof(val);
        else if (!strcmp("num_class", name)) num_class = atoi(val);
        else if (!strcmp("loss_type", name)) loss_type = atoi(val);
        else if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
    }
    /*!
     * \brief adjust base_score according to the loss type and objective name
     * \param obj name of the objective function
     */
    inline void AdjustBase(const char *obj){
        // unset loss defaults to logistic, except for plain linear regression
        if (loss_type == -1){
            loss_type = (!strcmp("reg:linear", obj)) ? 0 : 1;
        }
        // logistic losses: convert the probability-scale base score to margin space
        if (loss_type == 1 || loss_type == 2 || loss_type == 3){
            utils::Assert(base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain");
            base_score = -logf(1.0f / base_score - 1.0f);
        }
    }
};
private:
/*! \brief bookkeeping record for one matrix with an internal prediction cache */
struct CacheEntry{
    // the cached matrix (not owned)
    const DMatrix *mat_;
    // offset of this matrix inside the shared prediction buffer
    int buffer_offset_;
    // number of rows the matrix had when it was cached
    size_t num_row_;
    CacheEntry(const DMatrix *mat, int buffer_offset, size_t num_row)
        : mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row){
    }
};
/*! \brief the entries indicates that we have internal prediction cache */
std::vector<CacheEntry> cache_;
private:
// locate the internal prediction-buffer offset of a matrix; returns -1 when not cached
inline int FindBufferOffset(const DMatrix &mat){
    for (size_t i = 0; i < cache_.size(); ++i){
        const CacheEntry &entry = cache_[i];
        // double-check ownership: the entry must reference this matrix AND the
        // matrix must still name this learner as its cache owner
        if (entry.mat_ != &mat || mat.cache_learner_ptr_ != this) continue;
        if (entry.num_row_ == mat.Size()){
            return entry.buffer_offset_;
        }
        fprintf( stderr, "warning: number of rows in input matrix changed as remembered in cachelist, ignore cached results\n" );
        fflush( stderr );
    }
    return -1;
}
protected:
// whether to run silently (suppress informational output)
int silent;
// set of evaluation metrics computed during training
EvalSet evaluator_;
// the underlying gradient boosting model
booster::GBMBase base_gbm;
// model parameters: base score, loss type, feature/class counts
ModelParam mparam;
// objective function; NOTE(review): appears heap-allocated elsewhere — confirm ownership/deletion site
IObjFunction *obj_;
// name of objective function
std::string name_obj_;
// stored (name, value) configuration pairs
std::vector< std::pair<std::string, std::string> > cfg_;
protected:
// working buffers: per-instance gradient, hessian and predictions
std::vector<float> grad_, hess_, preds_;
};
}
};
#endif

View File

@@ -0,0 +1,260 @@
#ifndef XGBOOST_REGRANK_DATA_H
#define XGBOOST_REGRANK_DATA_H
/*!
* \file xgboost_regrank_data.h
* \brief input data structure for regression, binary classification, and rankning.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label <nonzero feature dimension> [feature index:feature value]+
* When using rank, an addtional group file with suffix group must be provided, giving the number of instances in each group
* When using weighted aware classification(regression), an addtional weight file must be provided, giving the weight of each instance
*
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
/*! \brief namespace to handle regression and rank */
namespace regrank{
/*! \brief data matrix for regression content */
struct DMatrix{
public:
    /*! \brief data information besides the features */
    struct Info{
        /*! \brief label of each instance */
        std::vector<float> labels;
        /*! \brief begin/end index of each group, needed when the learning task is ranking */
        std::vector<unsigned> group_ptr;
        /*! \brief weights of each instance, optional */
        std::vector<float> weights;
        /*! \brief specified root index of each instance, can be used for multi task setting */
        std::vector<unsigned> root_index;
        /*! \brief get weight of the i-th instance; defaults to 1 when no weights were given */
        inline float GetWeight( size_t i ) const{
            if( weights.size() != 0 ) return weights[i];
            else return 1.0f;
        }
        /*! \brief get root index of the i-th instance; defaults to 0 when not specified */
        inline float GetRoot( size_t i ) const{
            if( root_index.size() != 0 ) return static_cast<float>(root_index[i]);
            else return 0;
        }
    };
public:
    /*! \brief feature data content */
    booster::FMatrixS data;
    /*! \brief information fields */
    Info info;
    /*!
     * \brief cache pointer to verify if the data structure is cached in some learner
     * this is a bit ugly, we need to have double check verification, so if one side get deleted,
     * and some strange re-allocation gets the same pointer we will still be fine
     */
    void *cache_learner_ptr_;
public:
    /*! \brief default constructor */
    DMatrix(void):cache_learner_ptr_(NULL){}
    /*! \brief get the number of instances */
    inline size_t Size() const{
        return data.NumRow();
    }
    /*!
     * \brief load from text file in "label [index:value]+" line format
     * \param fname name of text data
     * \param silent whether print information or not
     */
    inline void LoadText(const char* fname, bool silent = false){
        data.Clear();
        FILE* file = utils::FopenCheck(fname, "r");
        float label = 0.0f; bool init = true;
        char tmp[1024];
        std::vector<booster::bst_uint> findex;
        std::vector<booster::bst_float> fvalue;
        while (fscanf(file, "%s", tmp) == 1){
            unsigned index; float value;
            if (sscanf(tmp, "%u:%f", &index, &value) == 2){
                // "index:value" token: one feature of the current instance
                findex.push_back(index); fvalue.push_back(value);
            }
            else{
                // a bare number token is the label starting a new instance,
                // so flush the previous instance first
                if (!init){
                    info.labels.push_back(label);
                    data.AddRow(findex, fvalue);
                }
                findex.clear(); fvalue.clear();
                utils::Assert(sscanf(tmp, "%f", &label) == 1, "invalid format");
                init = false;
            }
        }
        // flush the last pending instance
        // fix: guard with !init so an empty file no longer pushes an uninitialized label
        if (!init){
            info.labels.push_back(label);
            data.AddRow(findex, fvalue);
        }
        // initialize column support as well
        data.InitData();
        if (!silent){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
        }
        fclose(file);
        this->TryLoadGroup(fname, silent);
        this->TryLoadWeight(fname, silent);
    }
    /*!
     * \brief load from binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \return whether loading is success
     */
    inline bool LoadBinary(const char* fname, bool silent = false){
        FILE *fp = fopen64(fname, "rb");
        if (fp == NULL) return false;
        utils::FileStream fs(fp);
        data.LoadBinary(fs);
        info.labels.resize(data.NumRow());
        utils::Assert(fs.Read(&info.labels[0], sizeof(float)* data.NumRow()) != 0, "DMatrix LoadBinary");
        {// load in group ptr
            unsigned ngptr;
            if( fs.Read(&ngptr, sizeof(unsigned) ) != 0 ){
                info.group_ptr.resize( ngptr );
                if( ngptr != 0 ){
                    utils::Assert( fs.Read(&info.group_ptr[0], sizeof(unsigned) * ngptr) != 0, "Load group file");
                    utils::Assert( info.group_ptr.back() == data.NumRow(), "number of group must match number of record" );
                }
            }
        }
        {// load in weight
            unsigned nwt;
            if( fs.Read(&nwt, sizeof(unsigned) ) != 0 ){
                utils::Assert( nwt == 0 || nwt == data.NumRow(), "invalid weight" );
                info.weights.resize( nwt );
                if( nwt != 0 ){
                    // fix: weights are float, so read sizeof(float) per element;
                    // the original used sizeof(unsigned), which SaveBinary does not write
                    // (same byte count on common platforms, but type-wrong and fragile)
                    utils::Assert( fs.Read(&info.weights[0], sizeof(float) * nwt) != 0, "Load weight file");
                }
            }
        }
        fs.Close();
        if (!silent){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
            if( info.group_ptr.size() != 0 ){
                printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
            }
        }
        return true;
    }
    /*!
     * \brief save to binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     */
    inline void SaveBinary(const char* fname, bool silent = false){
        // initialize column support as well
        data.InitData();
        utils::FileStream fs(utils::FopenCheck(fname, "wb"));
        data.SaveBinary(fs);
        utils::Assert( info.labels.size() == data.NumRow(), "label size is not consistent with feature matrix size" );
        fs.Write(&info.labels[0], sizeof(float) * data.NumRow());
        {// write out group ptr
            unsigned ngptr = static_cast<unsigned>( info.group_ptr.size() );
            fs.Write(&ngptr, sizeof(unsigned) );
            if( ngptr != 0 ){
                fs.Write(&info.group_ptr[0], sizeof(unsigned) * ngptr);
            }
        }
        {// write out weight
            unsigned nwt = static_cast<unsigned>( info.weights.size() );
            fs.Write( &nwt, sizeof(unsigned) );
            if( nwt != 0 ){
                fs.Write(&info.weights[0], sizeof(float) * nwt);
            }
        }
        fs.Close();
        if (!silent){
            printf("%ux%u matrix with %lu entries is saved to %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname);
            if( info.group_ptr.size() != 0 ){
                printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1 );
            }
        }
    }
    /*!
     * \brief cache load data given a file name, if filename ends with .buffer, direct load binary
     * otherwise the function will first check if fname + '.buffer' exists,
     * if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
     * and try to create a buffer file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \param savebuffer whether do save binary buffer if it is text
     */
    inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true){
        int len = strlen(fname);
        if (len > 8 && !strcmp(fname + len - 7, ".buffer")){
            // explicit buffer file: must load as binary
            if( !this->LoadBinary(fname, silent) ){
                fprintf(stderr,"can not open file \"%s\"", fname);
                utils::Error("DMatrix::CacheLoad failed");
            }
            return;
        }
        char bname[1024];
        sprintf(bname, "%s.buffer", fname);
        // prefer the binary buffer; fall back to text and optionally create the buffer
        if (!this->LoadBinary(bname, silent)){
            this->LoadText(fname, silent);
            if (savebuffer) this->SaveBinary(bname, silent);
        }
    }
private:
    // load the optional ".group" sidecar file; returns false when it does not exist
    inline bool TryLoadGroup(const char* fname, bool silent = false){
        std::string name = fname;
        if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
            name.resize( name.length() - 7 );
        }
        name += ".group";
        //if exists group data load it in
        FILE *fi = fopen64(name.c_str(), "r");
        if (fi == NULL) return false;
        // group_ptr is a prefix sum of the per-group instance counts
        info.group_ptr.push_back(0);
        unsigned nline;
        while (fscanf(fi, "%u", &nline) == 1){
            info.group_ptr.push_back(info.group_ptr.back()+nline);
        }
        if(!silent){
            printf("%lu groups are loaded from %s\n", info.group_ptr.size()-1, name.c_str());
        }
        fclose(fi);
        utils::Assert( info.group_ptr.back() == data.NumRow(), "DMatrix: group data does not match the number of rows in feature matrix" );
        return true;
    }
    // load the optional ".weight" sidecar file; returns false when it does not exist
    inline bool TryLoadWeight(const char* fname, bool silent = false){
        std::string name = fname;
        if (name.length() > 8 && !strcmp(fname + name.length() - 7, ".buffer")){
            name.resize( name.length() - 7 );
        }
        name += ".weight";
        //if exists weight data load it in
        FILE *fi = fopen64(name.c_str(), "r");
        if (fi == NULL) return false;
        float wt;
        while (fscanf(fi, "%f", &wt) == 1){
            info.weights.push_back( wt );
        }
        if(!silent){
            printf("loading weight from %s\n", name.c_str());
        }
        fclose(fi);
        utils::Assert( info.weights.size() == data.NumRow(), "DMatrix: weight data does not match the number of rows in feature matrix" );
        return true;
    }
};
};
};
#endif

View File

@@ -0,0 +1,375 @@
#ifndef XGBOOST_REGRANK_EVAL_H
#define XGBOOST_REGRANK_EVAL_H
/*!
* \file xgboost_regrank_eval.h
* \brief evaluation metrics for regression and classification and rank
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <vector>
#include <algorithm>
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_omp.h"
#include "../utils/xgboost_random.h"
#include "xgboost_regrank_data.h"
#include "xgboost_regrank_utils.h"
namespace xgboost{
namespace regrank{
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric over all instances
* \param preds prediction
* \param info information, including label etc.
* \return the computed metric value
*/
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const = 0;
/*! \return name of metric, used as the display name in evaluation output */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
virtual ~IEvaluator(void){}
};
/*! \brief weighted root mean squared error */
struct EvalRMSE : public IEvaluator{
    virtual float Eval(const std::vector<float> &preds,
                       const DMatrix::Info &info) const {
        utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
        const unsigned ndata = static_cast<unsigned>(preds.size());
        float sum = 0.0, wsum = 0.0;
        // accumulate weighted squared error and total weight in parallel
        #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
        for (unsigned i = 0; i < ndata; ++i){
            const float w = info.GetWeight(i);
            const float err = info.labels[i] - preds[i];
            sum += err * err * w;
            wsum += w;
        }
        return sqrtf(sum / wsum);
    }
    virtual const char *Name(void) const{
        return "rmse";
    }
};
/*! \brief weighted negative log-likelihood of binary predictions */
struct EvalLogLoss : public IEvaluator{
    virtual float Eval(const std::vector<float> &preds,
                       const DMatrix::Info &info) const {
        utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
        const unsigned ndata = static_cast<unsigned>(preds.size());
        float sum = 0.0f, wsum = 0.0f;
        // accumulate -w * [y*log(p) + (1-y)*log(1-p)] and total weight
        #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
        for (unsigned i = 0; i < ndata; ++i){
            const float label = info.labels[i];
            const float prob = preds[i];
            const float w = info.GetWeight(i);
            sum -= w * (label * std::log(prob) + (1.0f - label) * std::log(1 - prob));
            wsum += w;
        }
        return sum / wsum;
    }
    virtual const char *Name(void) const{
        return "negllik";
    }
};
/*! \brief weighted binary classification error at threshold 0.5 */
struct EvalError : public IEvaluator{
    virtual float Eval(const std::vector<float> &preds,
                       const DMatrix::Info &info) const {
        const unsigned ndata = static_cast<unsigned>(preds.size());
        float sum = 0.0f, wsum = 0.0f;
        #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
        for (unsigned i = 0; i < ndata; ++i){
            const float w = info.GetWeight(i);
            // an instance counts as an error when the thresholded prediction
            // disagrees with the thresholded label
            const bool predicted_pos = preds[i] > 0.5f;
            const bool actual_pos = info.labels[i] >= 0.5f;
            if (predicted_pos != actual_pos) sum += w;
            wsum += w;
        }
        return sum / wsum;
    }
    virtual const char *Name(void) const{
        return "error";
    }
};
/*! \brief AMS (approximate median significance) metric; also records best threshold */
struct EvalAMS : public IEvaluator{
public:
EvalAMS(const char *name){
name_ = name;
// note: ams@0 will automatically select which ratio to go
utils::Assert( sscanf(name, "ams@%f", &ratio_ ) == 1, "invalid ams format" );
}
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
// AMS is weight based: every instance must carry a weight
utils::Assert( info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata);
#pragma omp parallel for schedule( static )
for (unsigned i = 0; i < ndata; ++i){
rec[i] = std::make_pair( preds[i], i );
}
// sort by prediction score, highest first
std::sort( rec.begin(), rec.end(), CmpFirst );
// cutoff: the top ratio_ fraction is classified as signal
unsigned ntop = static_cast<unsigned>( ratio_ * ndata );
if( ntop == 0 ) ntop = ndata;
// br is the constant regularization term of the AMS formula
const double br = 10.0;
unsigned thresindex = 0;
// s_tp: accumulated signal (true positive) weight; b_fp: background (false positive) weight
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i){
const unsigned ridx = rec[i].second;
const float wt = info.weights[ridx];
if( info.labels[ridx] > 0.5f ){
s_tp += wt;
}else{
b_fp += wt;
}
// only evaluate AMS at boundaries between distinct prediction values,
// tracking the best threshold seen so far
if( rec[i].first != rec[i+1].first ){
double ams = sqrtf( 2*((s_tp+b_fp+br) * log( 1.0 + s_tp/(b_fp+br) ) - s_tp) );
if( tams < ams ){
thresindex = i;
tams = ams;
}
}
}
if( ntop == ndata ){
// ams@0: report the ratio of the best threshold found, return the best AMS
fprintf( stderr, "\tams-ratio=%g", float(thresindex)/ndata );
return tams;
}else{
// fixed ratio: AMS at exactly the requested cutoff
return sqrtf( 2*((s_tp+b_fp+br) * log( 1.0 + s_tp/(b_fp+br) ) - s_tp) );
}
}
virtual const char *Name(void) const{
return name_.c_str();
}
private:
// display name, e.g. "ams@0.15"
std::string name_;
// fraction of instances classified as signal; 0 means auto-select
float ratio_;
};
/*! \brief weighted error for multi-class classification, prediction must match label exactly */
struct EvalMatchError : public IEvaluator{
public:
    virtual float Eval(const std::vector<float> &preds,
                       const DMatrix::Info &info) const {
        const unsigned ndata = static_cast<unsigned>(preds.size());
        float sum = 0.0f, wsum = 0.0f;
        #pragma omp parallel for reduction(+:sum,wsum) schedule( static )
        for (unsigned i = 0; i < ndata; ++i){
            const float w = info.GetWeight(i);
            // compare as integer class indices
            const int truth = static_cast<int>(info.labels[i]);
            if (static_cast<int>(preds[i]) != truth) sum += w;
            wsum += w;
        }
        return sum / wsum;
    }
    virtual const char *Name(void) const{
        return "merror";
    }
};
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator{
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
// when no group information is present, treat the whole dataset as one group
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Assert(gptr.back() == preds.size(), "EvalAuc: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
double sum_auc = 0.0f;
#pragma omp parallel reduction(+:sum_auc)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k){
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j){
rec.push_back(std::make_pair(preds[j], j));
}
std::sort(rec.begin(), rec.end(), CmpFirst);
// calculate AUC as weighted correctly-ordered pos/neg pairs over all pos/neg pairs
double sum_pospair = 0.0;
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
for (size_t j = 0; j < rec.size(); ++j){
const float wt = info.GetWeight(rec[j].second);
const float ctr = info.labels[rec[j].second];
// keep bucketing predictions in same bucket; on a new distinct prediction
// value, flush the tie bucket (ties credit half a pair via buf_pos * 0.5)
if (j != 0 && rec[j].first != rec[j - 1].first){
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
buf_neg = buf_pos = 0.0f;
}
buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
}
// flush the final tie bucket
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
// AUC is undefined when a group contains only one class
utils::Assert(sum_npos > 0.0 && sum_nneg > 0.0, "the dataset only contains pos or neg samples");
// this is the AUC
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
// return average AUC over list
return static_cast<float>(sum_auc) / ngroup;
}
virtual const char *Name(void) const{
return "auc";
}
};
/*! \brief base class of rank-list metrics: averages a per-group metric over all groups */
struct EvalRankList : public IEvaluator{
public:
virtual float Eval(const std::vector<float> &preds,
const DMatrix::Info &info) const {
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
utils::Assert( gptr.back() == preds.size(), "EvalRanklist: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
double sum_metric = 0.0f;
#pragma omp parallel reduction(+:sum_metric)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k){
// build (prediction, label) pairs of this group and delegate
// the actual metric to the derived class
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j){
rec.push_back(std::make_pair(preds[j], (int)info.labels[j]));
}
sum_metric += this->EvalMetric( rec );
}
}
// average of the per-group metric values
return static_cast<float>(sum_metric) / ngroup;
}
virtual const char *Name(void) const{
return name_.c_str();
}
protected:
// parses an optional "@N" suffix (e.g. "pre@10"); without it topn_ is unlimited
EvalRankList(const char *name){
name_ = name;
if( sscanf(name, "%*[^@]@%u", &topn_) != 1 ){
topn_ = UINT_MAX;
}
}
/*! \return evaluation metric, given the pair_sort record, (pred,label) */
virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &pair_sort ) const = 0;
protected:
// truncation level parsed from the metric name
unsigned topn_;
// display name of the metric
std::string name_;
};
/*! \brief precision at top-N, for both classification and rank */
struct EvalPrecision : public EvalRankList{
public:
    EvalPrecision(const char *name):EvalRankList(name){}
protected:
    virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
        // order by predicted score, best first
        std::sort(rec.begin(), rec.end(), CmpFirst);
        // count relevant items among the top-N results
        unsigned nhit = 0;
        const size_t limit = std::min(rec.size(), static_cast<size_t>(this->topn_));
        for (size_t j = 0; j < limit; ++j){
            if (rec[j].second != 0) ++nhit;
        }
        return static_cast<float>(nhit) / topn_;
    }
};
/*! \brief normalized discounted cumulative gain */
struct EvalNDCG : public EvalRankList{
public:
    EvalNDCG(const char *name):EvalRankList(name){}
protected:
    // DCG of the list in its current order, truncated at topn_
    inline float CalcDCG( const std::vector< std::pair<float,unsigned> > &rec ) const {
        double acc = 0.0;
        for (size_t i = 0; i < rec.size() && i < this->topn_; ++i){
            const unsigned rel = rec[i].second;
            if (rel != 0){
                // gain (2^rel - 1) discounted by log2(position + 2)
                acc += logf(2.0f) * ((1<<rel)-1) / logf( i + 2 );
            }
        }
        return static_cast<float>(acc);
    }
    virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
        // ideal ordering (sorted by label) gives the normalizer
        std::sort(rec.begin(), rec.end(), CmpSecond);
        const float idcg = this->CalcDCG(rec);
        // actual ordering by predicted score
        std::sort(rec.begin(), rec.end(), CmpFirst);
        const float dcg = this->CalcDCG(rec);
        return idcg == 0.0f ? 0.0f : dcg / idcg;
    }
};
/*! \brief mean average precision, truncated at top-N */
struct EvalMAP : public EvalRankList{
public:
    EvalMAP(const char *name):EvalRankList(name){}
protected:
    virtual float EvalMetric( std::vector< std::pair<float, unsigned> > &rec ) const {
        // order by predicted score, best first
        std::sort(rec.begin(), rec.end(), CmpFirst);
        unsigned nhits = 0;
        double sumap = 0.0;
        for (size_t i = 0; i < rec.size(); ++i){
            if (rec[i].second == 0) continue;
            ++nhits;
            // precision at each relevant position inside the cutoff
            if (i < this->topn_){
                sumap += static_cast<float>(nhits) / (i+1);
            }
        }
        // normalize by the number of relevant items; 0 when there are none
        if (nhits != 0) sumap /= nhits;
        return static_cast<float>(sumap);
    }
};
};
namespace regrank{
/*! \brief a set of evaluators */
struct EvalSet{
public:
    /*!
     * \brief add a metric to the set by name; duplicates and unknown names are ignored
     * \param name metric name, e.g. "rmse", "auc", "ams@0.15", "pre@10", "ndcg@5"
     */
    inline void AddEval(const char *name){
        // skip if a metric with the same display name is already registered
        for (size_t i = 0; i < evals_.size(); ++i){
            if (!strcmp(name, evals_[i]->Name())) return;
        }
        if (!strcmp(name, "rmse")) evals_.push_back(new EvalRMSE());
        if (!strcmp(name, "error")) evals_.push_back(new EvalError());
        if (!strcmp(name, "merror")) evals_.push_back(new EvalMatchError());
        if (!strcmp(name, "logloss")) evals_.push_back(new EvalLogLoss());
        if (!strcmp(name, "auc")) evals_.push_back(new EvalAuc());
        if (!strncmp(name, "ams@",4)) evals_.push_back(new EvalAMS(name));
        if (!strncmp(name, "pre@", 4)) evals_.push_back(new EvalPrecision(name));
        if (!strncmp(name, "map", 3)) evals_.push_back(new EvalMAP(name));
        // fix: compare the full "ndcg" prefix (was length 3, which also matched "ndc*")
        if (!strncmp(name, "ndcg", 4)) evals_.push_back(new EvalNDCG(name));
    }
    /*! \brief destructor: the set owns its evaluators */
    ~EvalSet(){
        for (size_t i = 0; i < evals_.size(); ++i){
            delete evals_[i];
        }
    }
    /*!
     * \brief run every registered metric and print "\t<evname>-<metric>:<value>" to fo
     * \param fo output stream
     * \param evname display name of the evaluated dataset
     * \param preds predictions to evaluate
     * \param info label/weight/group information
     */
    inline void Eval(FILE *fo, const char *evname,
                     const std::vector<float> &preds,
                     const DMatrix::Info &info) const{
        for (size_t i = 0; i < evals_.size(); ++i){
            float res = evals_[i]->Eval(preds, info);
            fprintf(fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res);
        }
    }
private:
    std::vector<const IEvaluator*> evals_;
};
};
};
#endif

View File

@@ -0,0 +1,303 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <ctime>
#include <string>
#include <cstring>
#include "xgboost_regrank.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_random.h"
#include "../utils/xgboost_config.h"
namespace xgboost{
namespace regrank{
/*!
* \brief wrapping the training process of the gradient boosting regression model,
* given the configuration
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
*/
class RegBoostTask{
public:
// entry point: read the config file (argv[1]) plus name=value command-line
// overrides, then dispatch to the requested task
inline int Run(int argc, char *argv[]){
if (argc < 2){
printf("Usage: <config>\n");
return 0;
}
// parameters from the configuration file first
utils::ConfigIterator itr(argv[1]);
while (itr.Next()){
this->SetParam(itr.name(), itr.val());
}
// then command-line arguments override the file
for (int i = 2; i < argc; i++){
char name[256], val[256];
if (sscanf(argv[i], "%[^=]=%s", name, val) == 2){
this->SetParam(name, val);
}
}
this->InitData();
this->InitLearner();
if (task == "dump"){
this->TaskDump();
return 0;
}
if (task == "interact"){
this->TaskInteractive(); return 0;
}
if (task == "dumppath"){
this->TaskDumpPath(); return 0;
}
if (task == "eval"){
this->TaskEval(); return 0;
}
if (task == "pred"){
this->TaskPred();
}
else{
this->TaskTrain();
}
return 0;
}
// set one configuration parameter; every pair is also recorded in cfg so it
// can be replayed into the learner in InitLearner
inline void SetParam(const char *name, const char *val){
if (!strcmp("silent", name)) silent = atoi(val);
if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
if (!strcmp("seed", name)) random::Seed(atoi(val));
if (!strcmp("num_round", name)) num_round = atoi(val);
if (!strcmp("save_period", name)) save_period = atoi(val);
if (!strcmp("eval_train", name)) eval_train = atoi(val);
if (!strcmp("task", name)) task = val;
if (!strcmp("data", name)) train_path = val;
if (!strcmp("test:data", name)) test_path = val;
if (!strcmp("model_in", name)) model_in = val;
if (!strcmp("model_out", name)) model_out = val;
if (!strcmp("model_dir", name)) model_dir_path = val;
if (!strcmp("fmap", name)) name_fmap = val;
if (!strcmp("name_dump", name)) name_dump = val;
if (!strcmp("name_dumppath", name)) name_dumppath = val;
if (!strcmp("name_pred", name)) name_pred = val;
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
if (!strcmp("interact:action", name)) interact_action = val;
// "batch:xxx" parameters are collected separately for interactive batch mode
if (!strncmp("batch:", name, 6)){
cfg_batch.PushBack(name + 6, val);
}
// "eval[name]=path" registers an evaluation dataset
if (!strncmp("eval[", name, 5)) {
char evname[256];
utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display");
eval_data_names.push_back(std::string(evname));
eval_data_paths.push_back(std::string(val));
}
cfg.PushBack(name, val);
}
public:
RegBoostTask(void){
// default parameters
silent = 0;
use_buffer = 1;
num_round = 10;
save_period = 0;
eval_train = 0;
dump_model_stats = 0;
task = "train";
model_in = "NULL";
model_out = "NULL";
name_fmap = "NULL";
name_pred = "pred.txt";
name_dump = "dump.txt";
name_dumppath = "dump.path.txt";
model_dir_path = "./";
interact_action = "update";
}
~RegBoostTask(void){
// the task owns the evaluation matrices it allocated in InitData
for (size_t i = 0; i < deval.size(); i++){
delete deval[i];
}
}
private:
// load the data matrices required by the selected task
inline void InitData(void){
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
if (task == "dump") return;
if (task == "pred" || task == "dumppath"){
data.CacheLoad(test_path.c_str(), silent != 0, use_buffer != 0);
}
else{
// training
data.CacheLoad(train_path.c_str(), silent != 0, use_buffer != 0);
utils::Assert(eval_data_names.size() == eval_data_paths.size());
for (size_t i = 0; i < eval_data_names.size(); ++i){
deval.push_back(new DMatrix());
deval.back()->CacheLoad(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0);
devalall.push_back(deval.back());
}
std::vector<DMatrix *> dcache(1, &data);
for( size_t i = 0; i < deval.size(); ++ i){
dcache.push_back( deval[i] );
}
// set cache data to be all training and evaluation data
learner.SetCacheData(dcache);
// add training set to evaluation set if needed
if( eval_train != 0 ){
devalall.push_back( &data );
eval_data_names.push_back( std::string("train") );
}
}
}
// configure the learner and either load an existing model or initialize a new one
inline void InitLearner(void){
cfg.BeforeFirst();
while (cfg.Next()){
learner.SetParam(cfg.name(), cfg.val());
}
if (model_in != "NULL"){
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
learner.LoadModel(fi);
fi.Close();
}
else{
utils::Assert(task == "train", "model_in not specified");
learner.InitModel();
}
learner.InitTrainer();
}
// run num_round boosting iterations, evaluating and periodically saving
inline void TaskTrain(void){
const time_t start = time(NULL);
unsigned long elapsed = 0;
for (int i = 0; i < num_round; ++i){
elapsed = (unsigned long)(time(NULL) - start);
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(data);
learner.EvalOneIter(i, devalall, eval_data_names);
if (save_period != 0 && (i + 1) % save_period == 0){
this->SaveModel(i);
}
elapsed = (unsigned long)(time(NULL) - start);
}
// always save final round
// NOTE(review): model_out=="NONE" disables saving, while the default "NULL"
// means "save under a numbered file name" — confirm this naming convention
if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE"){
if (model_out == "NULL"){
this->SaveModel(num_round - 1);
}
else{
this->SaveModel(model_out.c_str());
}
}
if (!silent){
printf("\nupdating end, %lu sec in all\n", elapsed);
}
}
// evaluate the loaded model on all evaluation datasets once
inline void TaskEval(void){
learner.EvalOneIter(0, devalall, eval_data_names);
}
// interactive mode: apply batch "run"/parameter actions, then save the model
inline void TaskInteractive(void){
const time_t start = time(NULL);
unsigned long elapsed = 0;
int batch_action = 0;
cfg_batch.BeforeFirst();
while (cfg_batch.Next()){
if (!strcmp(cfg_batch.name(), "run")){
learner.UpdateInteract(interact_action, data);
batch_action += 1;
}
else{
learner.SetParam(cfg_batch.name(), cfg_batch.val());
}
}
// no batch "run" given: perform the single configured action
if (batch_action == 0){
learner.UpdateInteract(interact_action, data);
}
utils::Assert(model_out != "NULL", "interactive mode must specify model_out");
this->SaveModel(model_out.c_str());
elapsed = (unsigned long)(time(NULL) - start);
if (!silent){
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed);
}
}
// dump the model in human-readable form
inline void TaskDump(void){
FILE *fo = utils::FopenCheck(name_dump.c_str(), "w");
learner.DumpModel(fo, fmap, dump_model_stats != 0);
fclose(fo);
}
// dump the decision paths taken by the test data
inline void TaskDumpPath(void){
FILE *fo = utils::FopenCheck(name_dumppath.c_str(), "w");
learner.DumpPath(fo, data);
fclose(fo);
}
// save the model to an explicit file name
inline void SaveModel(const char *fname) const{
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
learner.SaveModel(fo);
fo.Close();
}
// save the model of round i under "<model_dir>/<round+1 padded>.model"
inline void SaveModel(int i) const{
char fname[256];
sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1);
this->SaveModel(fname);
}
// predict on the test data and write one prediction per line
inline void TaskPred(void){
std::vector<float> preds;
if (!silent) printf("start prediction...\n");
learner.Predict(preds, data);
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < preds.size(); i++){
fprintf(fo, "%f\n", preds[i]);
}
fclose(fo);
}
private:
/* \brief whether silent */
int silent;
/* \brief whether use auto binary buffer */
int use_buffer;
/* \brief whether evaluate training statistics */
int eval_train;
/* \brief number of boosting iterations */
int num_round;
/* \brief the period to save the model, 0 means only save the final round model */
int save_period;
/*! \brief interactive action to perform ("update" or "remove") */
std::string interact_action;
/* \brief the path of training/test data set */
std::string train_path, test_path;
/* \brief the path of test model file, or file to restart training */
std::string model_in;
/* \brief the path of final model file, to be saved */
std::string model_out;
/* \brief the path of directory containing the saved models */
std::string model_dir_path;
/* \brief task to perform */
std::string task;
/* \brief name of predict file */
std::string name_pred;
/* \brief whether dump statistics along with model */
int dump_model_stats;
/* \brief name of feature map */
std::string name_fmap;
/* \brief name of dump file */
std::string name_dump;
/* \brief name of dump path file */
std::string name_dumppath;
/* \brief the paths of validation data sets */
std::vector<std::string> eval_data_paths;
/* \brief the names of the evaluation data used in output log */
std::vector<std::string> eval_data_names;
/*! \brief saves configurations */
utils::ConfigSaver cfg;
/*! \brief batch configurations */
utils::ConfigSaver cfg_batch;
private:
// training (or test) data matrix
DMatrix data;
// owned evaluation matrices
std::vector<DMatrix*> deval;
// all matrices passed to evaluation (may include &data when eval_train is set)
std::vector<const DMatrix*> devalall;
// feature map used for model dumping
utils::FeatMap fmap;
// the boosting learner
RegRankBoostLearner learner;
};
};
};
int main( int argc, char *argv[] ){
xgboost::random::Seed( 0 );
xgboost::regrank::RegBoostTask tsk;
return tsk.Run( argc, argv );
}

View File

@@ -0,0 +1,131 @@
#ifndef XGBOOST_REGRANK_OBJ_H
#define XGBOOST_REGRANK_OBJ_H
/*!
* \file xgboost_regrank_obj.h
* \brief defines objective function interface used in xgboost for regression and rank
* \author Tianqi Chen, Kailong Chen
*/
#include "xgboost_regrank_data.h"
namespace xgboost{
namespace regrank{
/*! \brief interface of objective function */
class IObjFunction{
public:
/*! \brief virtual destructor */
virtual ~IObjFunction(void){}
/*!
 * \brief set parameters from outside
 * \param name name of the parameter
 * \param val value of the parameter
 */
virtual void SetParam(const char *name, const char *val) = 0;
/*!
 * \brief get gradient over each of predictions, given existing information
 * \param preds prediction of current round
 * \param info information about labels, weights, groups in rank
 * \param iter current iteration number
 * \param grad gradient over each preds, resized and filled by the implementation
 * \param hess second order gradient over each preds, resized and filled by the implementation
 */
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) = 0;
/*! \return the default evaluation metric for the problem */
virtual const char* DefaultEvalMetric(void) = 0;
/*!
 * \brief transform prediction values, this is only called when Prediction is called
 * \param preds prediction values, transformed in place in this vector
 */
virtual void PredTransform(std::vector<float> &preds){}
/*!
 * \brief transform prediction values, this is only called when Eval is called,
 *        by default it redirects to PredTransform
 * \param preds prediction values, transformed in place in this vector
 */
virtual void EvalTransform(std::vector<float> &preds){ this->PredTransform(preds); }
};
};
namespace regrank{
/*!
 * \brief loss-type dispatcher: holds the configured loss id and computes the
 *        prediction transform plus first/second order gradients for it
 */
struct LossType{
public:
    // supported loss ids, as passed via the "loss_type" parameter
    const static int kLinearSquare = 0;
    const static int kLogisticNeglik = 1;
    const static int kLogisticClassify = 2;
    const static int kLogisticRaw = 3;
public:
    /*! \brief indicate which type we are using */
    int loss_type;
public:
    /*!
     * \brief transform the linear sum to prediction
     * \param x linear sum of boosting ensemble
     * \return transformed prediction
     */
    // made const for consistency with the gradient members below
    inline float PredTransform(float x) const{
        switch (loss_type){
        // raw logistic output and square loss keep the margin untransformed
        case kLogisticRaw:
        case kLinearSquare: return x;
        // probabilistic logistic outputs go through the sigmoid
        case kLogisticClassify:
        case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }
    /*!
     * \brief calculate first order gradient of loss, given transformed prediction
     * \param predt transformed prediction (the raw margin for kLogisticRaw)
     * \param label true label
     * \return first order gradient
     */
    inline float FirstOrderGradient(float predt, float label) const{
        switch (loss_type){
        case kLinearSquare: return predt - label;
        // kLogisticRaw receives the untransformed margin: map it to a
        // probability, then deliberately fall through to the logistic gradient
        case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
        case kLogisticClassify:
        case kLogisticNeglik: return predt - label;
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }
    /*!
     * \brief calculate second order gradient of loss, given transformed prediction
     * \param predt transformed prediction (the raw margin for kLogisticRaw)
     * \param label true label
     * \return second order gradient
     */
    inline float SecondOrderGradient(float predt, float label) const{
        switch (loss_type){
        case kLinearSquare: return 1.0f;
        // same deliberate fall-through as FirstOrderGradient
        case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
        case kLogisticClassify:
        case kLogisticNeglik: return predt * (1 - predt);
        default: utils::Error("unknown loss_type"); return 0.0f;
        }
    }
};
};
};
#include "xgboost_regrank_obj.hpp"
namespace xgboost{
namespace regrank{
/*!
 * \brief factory: create an objective function from its registered name
 * \param name objective name such as "reg:linear" or "rank:pairwise"
 * \return newly allocated objective; the caller owns the returned pointer
 */
inline IObjFunction* CreateObjFunction( const char *name ){
    if( strcmp( name, "reg:linear" ) == 0 )      return new RegressionObj( LossType::kLinearSquare );
    if( strcmp( name, "reg:logistic" ) == 0 )    return new RegressionObj( LossType::kLogisticNeglik );
    if( strcmp( name, "binary:logistic" ) == 0 ) return new RegressionObj( LossType::kLogisticClassify );
    if( strcmp( name, "binary:logitraw" ) == 0 ) return new RegressionObj( LossType::kLogisticRaw );
    if( strcmp( name, "multi:softmax" ) == 0 )   return new SoftmaxMultiClassObj(0);
    if( strcmp( name, "multi:softprob" ) == 0 )  return new SoftmaxMultiClassObj(1);
    if( strcmp( name, "rank:pairwise" ) == 0 )   return new PairwiseRankObj();
    if( strcmp( name, "rank:softmax" ) == 0 )    return new SoftmaxRankObj();
    utils::Error("unknown objective function type");
    return NULL;
}
};
};
#endif

View File

@@ -0,0 +1,353 @@
#ifndef XGBOOST_REGRANK_OBJ_HPP
#define XGBOOST_REGRANK_OBJ_HPP
/*!
* \file xgboost_regrank_obj.hpp
* \brief implementation of objective functions
* \author Tianqi Chen, Kailong Chen
*/
//#include "xgboost_regrank_sample.h"
#include <vector>
#include <functional>
#include "xgboost_regrank_utils.h"
namespace xgboost{
namespace regrank{
class RegressionObj : public IObjFunction{
public:
RegressionObj( int loss_type ){
loss.loss_type = loss_type;
scale_pos_weight = 1.0f;
}
virtual ~RegressionObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
if( !strcmp( "scale_pos_weight", name ) ) scale_pos_weight = (float)atof( val );
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
float p = loss.PredTransform(preds[j]);
float w = info.GetWeight(j);
if( info.labels[j] == 1.0f ) w *= scale_pos_weight;
grad[j] = loss.FirstOrderGradient(p, info.labels[j]) * w;
hess[j] = loss.SecondOrderGradient(p, info.labels[j]) * w;
}
}
virtual const char* DefaultEvalMetric(void) {
if( loss.loss_type == LossType::kLogisticClassify ) return "error";
if( loss.loss_type == LossType::kLogisticRaw ) return "auc";
return "rmse";
}
virtual void PredTransform(std::vector<float> &preds){
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule( static )
for (unsigned j = 0; j < ndata; ++j){
preds[j] = loss.PredTransform( preds[j] );
}
}
private:
float scale_pos_weight;
LossType loss;
};
};
namespace regrank{
// simple softmax rank objective: within each query group, fit a softmax
// distribution over the group's scores against a single relevant document
class SoftmaxRankObj : public IObjFunction{
public:
SoftmaxRankObj(void){
}
virtual ~SoftmaxRankObj(){}
// this objective exposes no tunable parameters
virtual void SetParam(const char *name, const char *val){
}
/*!
 * \brief per-group softmax gradient; requires info.group_ptr so instances
 *        can be partitioned into query groups
 */
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
const unsigned ngroup = static_cast<unsigned>( gptr.size() - 1 );
// groups are independent, so they are processed in parallel;
// rec is per-thread scratch holding one group's scores
#pragma omp parallel
{
std::vector< float > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k){
rec.clear();
// nhit accumulates the group's labels (float added into int, so labels
// are expected to be 0/1 relevance indicators)
int nhit = 0;
for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
rec.push_back( preds[j] );
grad[j] = hess[j] = 0.0f;
nhit += info.labels[j];
}
Softmax( rec );
if( nhit == 1 ){
// cross-entropy of the softmax against the one-hot label vector
for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
float p = rec[ j - gptr[k] ];
grad[j] = p - info.labels[j];
hess[j] = 2.0f * p * ( 1.0f - p );
}
}else{
// a group with no relevant document keeps zero gradient;
// more than one relevant document is rejected
utils::Assert( nhit == 0, "softmax does not allow multiple labels" );
}
}
}
}
virtual const char* DefaultEvalMetric(void) {
return "pre@1";
}
};
// simple softmax multi-class classification
// predictions are laid out class-major: preds[j + k * ndata] is the margin
// of instance j for class k
class SoftmaxMultiClassObj : public IObjFunction{
public:
// output_prob != 0: PredTransform emits per-class probabilities;
// output_prob == 0: PredTransform emits only the argmax class index
SoftmaxMultiClassObj(int output_prob):output_prob(output_prob){
nclass = 0;
}
virtual ~SoftmaxMultiClassObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "num_class", name ) ) nclass = atoi(val);
}
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( nclass != 0, "must set num_class to use softmax" );
utils::Assert( preds.size() == (size_t)nclass * info.labels.size(), "SoftmaxMultiClassObj: label size and pred size does not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const unsigned ndata = static_cast<unsigned>(info.labels.size());
#pragma omp parallel
{
// per-thread scratch buffer holding one instance's class margins
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned j = 0; j < ndata; ++j){
for( int k = 0; k < nclass; ++ k ){
rec[k] = preds[j + k * ndata];
}
Softmax( rec );
int label = static_cast<int>(info.labels[j]);
// negative labels are decoded as -l -> class l-1
// NOTE(review): presumably a special label encoding used upstream — confirm
if( label < 0 ){
label = -label - 1;
}
utils::Assert( label < nclass, "SoftmaxMultiClassObj: label exceed num_class" );
// softmax cross-entropy gradient: p - 1 on the true class, p elsewhere;
// hessian is 2*p*(1-p), twice the diagonal term (conservative update,
// matching the rank objectives in this file)
for( int k = 0; k < nclass; ++ k ){
float p = rec[ k ];
if( label == k ){
grad[j+k*ndata] = p - 1.0f;
}else{
grad[j+k*ndata] = p;
}
hess[j+k*ndata] = 2.0f * p * ( 1.0f - p );
}
}
}
}
virtual void PredTransform(std::vector<float> &preds){
this->Transform(preds, output_prob);
}
// evaluation always receives the argmax class, regardless of output_prob
virtual void EvalTransform(std::vector<float> &preds){
this->Transform(preds, 0);
}
private:
/*!
 * \brief shared implementation of Pred/EvalTransform
 * \param preds class-major margins, transformed in place
 * \param prob nonzero: replace margins with per-class probabilities;
 *        zero: shrink preds to one argmax class index per instance
 */
inline void Transform(std::vector<float> &preds, int prob){
utils::Assert( nclass != 0, "must set num_class to use softmax" );
utils::Assert( preds.size() % nclass == 0, "SoftmaxMultiClassObj: label size and pred size does not match" );
const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
#pragma omp parallel
{
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned j = 0; j < ndata; ++j){
for( int k = 0; k < nclass; ++ k ){
rec[k] = preds[j + k * ndata];
}
if( prob == 0 ){
preds[j] = FindMaxIndex( rec );
}else{
Softmax( rec );
for( int k = 0; k < nclass; ++ k ){
preds[j + k * ndata] = rec[k];
}
}
}
}
// argmax mode: only the first ndata entries are meaningful, drop the rest
if( prob == 0 ){
preds.resize( ndata );
}
}
virtual const char* DefaultEvalMetric(void) {
return "merror";
}
private:
// number of classes; must be set through "num_class" before use
int nclass;
// see constructor for the meaning of this flag
int output_prob;
};
};
namespace regrank{
/*! \brief objective for lambda rank */
class LambdaRankObj : public IObjFunction{
public:
LambdaRankObj(void){
// pairwise preference is modeled with a logistic loss on the score margin
loss.loss_type = LossType::kLogisticRaw;
fix_list_weight = 0.0f;
num_pairsample = 1;
}
virtual ~LambdaRankObj(){}
virtual void SetParam(const char *name, const char *val){
if( !strcmp( "loss_type", name ) ) loss.loss_type = atoi( val );
if( !strcmp( "fix_list_weight", name ) ) fix_list_weight = (float)atof( val );
if( !strcmp( "num_pairsample", name ) ) num_pairsample = atoi( val );
}
public:
/*!
 * \brief compute lambda gradients: sample preference pairs inside each query
 *        group, weight them via GetLambdaWeight, then push gradient/hessian
 *        to both members of every pair
 */
virtual void GetGradient(const std::vector<float>& preds,
const DMatrix::Info &info,
int iter,
std::vector<float> &grad,
std::vector<float> &hess ) {
utils::Assert( preds.size() == info.labels.size(), "label size predict size not match" );
grad.resize(preds.size()); hess.resize(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert( gptr.size() != 0 && gptr.back() == preds.size(), "rank loss must have group file" );
const unsigned ngroup = static_cast<unsigned>( gptr.size() - 1 );
#pragma omp parallel
{
// parallel construct, declare random number generator here, so that each
// thread uses its own random number generator, seeded by thread id and current iteration
random::Random rnd; rnd.Seed( iter * 1111 + omp_get_thread_num() );
std::vector<LambdaPair> pairs;
std::vector<ListEntry> lst;
std::vector< std::pair<float,unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k){
lst.clear(); pairs.clear();
for(unsigned j = gptr[k]; j < gptr[k+1]; ++j ){
lst.push_back( ListEntry(preds[j], info.labels[j], j ) );
grad[j] = hess[j] = 0.0f;
}
// lst: this group's entries ordered by descending prediction score
std::sort( lst.begin(), lst.end(), ListEntry::CmpPred );
// rec: (label, position-in-lst) pairs ordered by descending label
rec.resize( lst.size() );
for( unsigned i = 0; i < lst.size(); ++i ){
rec[i] = std::make_pair( lst[i].label, i );
}
std::sort( rec.begin(), rec.end(), CmpFirst );
// enumerate buckets with same label, for each item in the lst, grab another sample randomly
for( unsigned i = 0; i < rec.size(); ){
unsigned j = i + 1;
while( j < rec.size() && rec[j].first == rec[i].first ) ++ j;
// bucket in [i,j), get a sample outside bucket
unsigned nleft = i, nright = rec.size() - j;
if( nleft + nright != 0 ){
int nsample = num_pairsample;
while( nsample -- ){
for( unsigned pid = i; pid < j; ++ pid ){
// ridx is uniform over the nleft+nright items outside the bucket:
// values < nleft hit the higher-label prefix [0,i); otherwise map into
// the lower-label suffix [j,size) via ridx+j-i (== ridx-nleft+j, nleft==i)
unsigned ridx = static_cast<unsigned>( rnd.RandDouble() * (nleft+nright) );
if( ridx < nleft ){
pairs.push_back( LambdaPair( rec[ridx].second, rec[pid].second ) );
}else{
pairs.push_back( LambdaPair( rec[pid].second, rec[ridx+j-i].second ) );
}
}
}
}
i = j;
}
// get lambda weight for the pairs
this->GetLambdaWeight( lst, pairs );
// rescale each gradient and hessian so that the lst have constant weight
float scale = 1.0f / num_pairsample;
if( fix_list_weight != 0.0f ){
scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
}
for( size_t i = 0; i < pairs.size(); ++ i ){
const ListEntry &pos = lst[ pairs[i].pos_index ];
const ListEntry &neg = lst[ pairs[i].neg_index ];
const float w = pairs[i].weight * scale;
// logistic loss on the score margin with target 1: pos should rank higher
float p = loss.PredTransform( pos.pred - neg.pred );
float g = loss.FirstOrderGradient( p, 1.0f );
float h = loss.SecondOrderGradient( p, 1.0f );
// accumulate gradient and hessian in both pid, and nid,
grad[ pos.rindex ] += g * w;
grad[ neg.rindex ] -= g * w;
// take conservative update, scale hessian by 2
hess[ pos.rindex ] += 2.0f * h * w;
hess[ neg.rindex ] += 2.0f * h * w;
}
}
}
}
virtual const char* DefaultEvalMetric(void) {
return "map";
}
private:
// pairwise loss applied to score margins
LossType loss;
// number of pair samples performed for each instance
int num_pairsample;
// fixed weight of each list; 0 disables the per-list rescaling
float fix_list_weight;
protected:
/*! \brief helper information in a list */
struct ListEntry{
/*! \brief the predicted score of the entry */
float pred;
/*! \brief the actual label of the entry */
float label;
/*! \brief row index in the data matrix */
unsigned rindex;
// constructor
ListEntry(float pred, float label, unsigned rindex): pred(pred),label(label),rindex(rindex){}
// comparator by prediction
inline static bool CmpPred(const ListEntry &a, const ListEntry &b){
return a.pred > b.pred;
}
// comparator by label
inline static bool CmpLabel(const ListEntry &a, const ListEntry &b){
return a.label > b.label;
}
};
/*! \brief a pair in the lambda rank */
struct LambdaPair{
/*! \brief positive index: this is a position in the list */
unsigned pos_index;
/*! \brief negative index: this is a position in the list */
unsigned neg_index;
/*! \brief weight to be filled in, defaults to 1.0f */
float weight;
LambdaPair( unsigned pos_index, unsigned neg_index ):pos_index(pos_index),neg_index(neg_index),weight(1.0f){}
};
/*!
 * \brief get lambda weight for existing pairs
 * \param sorted_list a list that is sorted by pred score
 * \param pairs record of pairs, containing the pairs to fill in weights
 */
virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ) = 0;
};
};
namespace regrank{
/*! \brief pairwise rank objective: LambdaRank with uniform pair weights */
class PairwiseRankObj: public LambdaRankObj{
public:
virtual ~PairwiseRankObj(void){}
// leave every pair's weight at its constructed value of 1.0f
virtual void GetLambdaWeight( const std::vector<ListEntry> &sorted_list, std::vector<LambdaPair> &pairs ){}
};
};
};
#endif

View File

@@ -0,0 +1,45 @@
#ifndef XGBOOST_REGRANK_UTILS_H
#define XGBOOST_REGRANK_UTILS_H
/*!
* \file xgboost_regrank_utils.h
* \brief useful helper functions
* \author Tianqi Chen, Kailong Chen
*/
namespace xgboost{
namespace regrank{
// numerically stable in-place softmax: rec[i] = exp(rec[i]) / sum_j exp(rec[j]).
// the max is subtracted before exponentiation so large inputs do not overflow
inline static void Softmax( std::vector<float>& rec ){
    // guard: rec[0] below would be out of range for an empty vector
    if( rec.empty() ) return;
    float wmax = rec[0];
    for( size_t i = 1; i < rec.size(); ++ i ){
        wmax = std::max( rec[i], wmax );
    }
    // accumulate the normalizer in double to limit rounding error
    double wsum = 0.0;
    for( size_t i = 0; i < rec.size(); ++ i ){
        rec[i] = expf(rec[i]-wmax);
        wsum += rec[i];
    }
    // normalize; the max element contributes expf(0) == 1, so wsum >= 1
    for( size_t i = 0; i < rec.size(); ++ i ){
        rec[i] /= static_cast<float>(wsum);
    }
}
// simple helper function to find the index of the (approximately) largest
// element; a later element must beat the current best by more than 1e-6 to
// take over, so near-ties resolve to the earliest index
inline static int FindMaxIndex( std::vector<float>& rec ){
    size_t best = 0;
    for( size_t pos = 1; pos < rec.size(); ++ pos ){
        if( rec[pos] > rec[best] + 1e-6f ) best = pos;
    }
    return (int)best;
}
// comparator: order pairs by descending first component (prediction/label)
inline static bool CmpFirst(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
    return b.first < a.first;
}
// comparator: order pairs by descending second component (index)
inline static bool CmpSecond(const std::pair<float, unsigned> &a, const std::pair<float, unsigned> &b){
    return b.second < a.second;
}
};
};
#endif

View File

@@ -1,403 +0,0 @@
#ifndef XGBOOST_REG_H
#define XGBOOST_REG_H
/*!
* \file xgboost_reg.h
* \brief class for gradient boosted regression
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "xgboost_reg_data.h"
#include "xgboost_reg_eval.h"
#include "../utils/xgboost_omp.h"
#include "../booster/xgboost_gbmbase.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief class for gradient boosted regression */
class RegBoostLearner{
public:
/*! \brief constructor */
RegBoostLearner( void ){
silent = 0;
}
/*!
 * \brief a regression booster associated with training and evaluating data
 * \param train pointer to the training data
 * \param evals array of evaluating data
 * \param evname name of evaluation data, used to print statistics
 */
RegBoostLearner( const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname ){
silent = 0;
this->SetData(train,evals,evname);
}
/*!
 * \brief associate regression booster with training and evaluating data
 * \param train pointer to the training data
 * \param evals array of evaluating data
 * \param evname name of evaluation data, used to print statistics
 */
inline void SetData( const DMatrix *train,
const std::vector<DMatrix *> &evals,
const std::vector<std::string> &evname ){
this->train_ = train;
this->evals_ = evals;
this->evname_ = evname;
// estimate feature bound
int num_feature = (int)(train->data.NumCol());
// assign buffer index: one prediction-buffer slot per training and
// evaluation instance, training rows first (see UpdateOneIter/EvalOneIter)
unsigned buffer_size = static_cast<unsigned>( train->Size() );
for( size_t i = 0; i < evals.size(); ++ i ){
buffer_size += static_cast<unsigned>( evals[i]->Size() );
num_feature = std::max( num_feature, (int)(evals[i]->data.NumCol()) );
}
char str_temp[25];
if( num_feature > mparam.num_feature ){
mparam.num_feature = num_feature;
sprintf( str_temp, "%d", num_feature );
base_gbm.SetParam( "bst:num_feature", str_temp );
}
sprintf( str_temp, "%u", buffer_size );
base_gbm.SetParam( "num_pbuffer", str_temp );
if( !silent ){
printf( "buffer_size=%u\n", buffer_size );
}
// set eval_preds tmp space
this->eval_preds_.resize( evals.size(), std::vector<float>() );
}
/*!
 * \brief set parameters from outside
 * \param name name of the parameter
 * \param val value of the parameter
 */
inline void SetParam( const char *name, const char *val ){
if( !strcmp( name, "silent") ) silent = atoi( val );
if( !strcmp( name, "eval_metric") ) evaluator_.AddEval( val );
// unmatched names are forwarded to both the model parameter and the booster
mparam.SetParam( name, val );
base_gbm.SetParam( name, val );
}
/*!
 * \brief initialize solver before training, called before training
 * this function is reserved for solver to allocate necessary space and do other preparation
 */
inline void InitTrainer( void ){
base_gbm.InitTrainer();
// always register a default metric matching the configured loss
if( mparam.loss_type == kLogisticClassify ){
evaluator_.AddEval( "error" );
}else{
evaluator_.AddEval( "rmse" );
}
evaluator_.Init();
}
/*!
 * \brief initialize the current data storage for model, if the model is used first time, call this function
 */
inline void InitModel( void ){
base_gbm.InitModel();
mparam.AdjustBase();
}
/*!
 * \brief load model from stream
 * \param fi input stream
 */
inline void LoadModel( utils::IStream &fi ){
base_gbm.LoadModel( fi );
// NOTE(review): Assert called without a message here, unlike elsewhere in
// this file — confirm utils::Assert provides a default message overload
utils::Assert( fi.Read( &mparam, sizeof(ModelParam) ) != 0 );
}
/*!
 * \brief DumpModel
 * \param fo text file
 * \param fmap feature map that may help give interpretations of feature
 * \param with_stats whether print statistics as well
 */
inline void DumpModel( FILE *fo, const utils::FeatMap& fmap, bool with_stats ){
base_gbm.DumpModel( fo, fmap, with_stats );
}
/*!
 * \brief Dump path of all trees
 * \param fo text file
 * \param data input data
 */
inline void DumpPath( FILE *fo, const DMatrix &data ){
base_gbm.DumpPath( fo, data.data );
}
/*!
 * \brief save model to stream
 * \param fo output stream
 */
inline void SaveModel( utils::IStream &fo ) const{
base_gbm.SaveModel( fo );
fo.Write( &mparam, sizeof(ModelParam) );
}
/*!
 * \brief update the model for one iteration
 * \param iter iteration number
 */
inline void UpdateOneIter( int iter ){
// training data occupies buffer slots [0, train_->Size())
this->PredictBuffer( preds_, *train_, 0 );
this->GetGradient( preds_, train_->labels, grad_, hess_ );
std::vector<unsigned> root_index;
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
}
/*!
 * \brief evaluate the model for specific iteration
 * \param iter iteration number
 * \param fo file to output log
 */
inline void EvalOneIter( int iter, FILE *fo = stderr ){
fprintf( fo, "[%d]", iter );
// evaluation data follows the training rows in the prediction buffer
int buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
std::vector<float> &preds = this->eval_preds_[ i ];
this->PredictBuffer( preds, *evals_[i], buffer_offset);
evaluator_.Eval( fo, evname_[i].c_str(), preds, (*evals_[i]).labels );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
fprintf( fo,"\n" );
}
/*! \brief get prediction, without buffering */
inline void Predict( std::vector<float> &preds, const DMatrix &data ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
// buffer index -1: NOTE(review) presumably bypasses the prediction
// buffer — confirm against booster::GBMBase::Predict
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.Predict( data.data, j, -1 ) );
}
}
public:
/*!
 * \brief interactively update the model: predict, optionally remove the last
 *        booster, boost once, then re-predict all buffered data
 * \param action interaction command; "remove" deletes the last booster
 */
inline void UpdateInteract( std::string action ){
this->InteractPredict( preds_, *train_, 0 );
int buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
std::vector<float> &preds = this->eval_preds_[ i ];
this->InteractPredict( preds, *evals_[i], buffer_offset );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
if( action == "remove" ){
base_gbm.DelteBooster(); return;
}
this->GetGradient( preds_, train_->labels, grad_, hess_ );
std::vector<unsigned> root_index;
base_gbm.DoBoost( grad_, hess_, train_->data, root_index );
this->InteractRePredict( *train_, 0 );
buffer_offset = static_cast<int>( train_->Size() );
for( size_t i = 0; i < evals_.size(); ++i ){
this->InteractRePredict( *evals_[i], buffer_offset );
buffer_offset += static_cast<int>( evals_[i]->Size() );
}
}
private:
/*! \brief get the transformed predictions, given data */
inline void InteractPredict( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.InteractPredict( data.data, j, buffer_offset + j ) );
}
}
/*! \brief repredict trial */
inline void InteractRePredict( const DMatrix &data, unsigned buffer_offset ){
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
base_gbm.InteractRePredict( data.data, j, buffer_offset + j );
}
}
private:
/*! \brief get the transformed predictions, given data */
inline void PredictBuffer( std::vector<float> &preds, const DMatrix &data, unsigned buffer_offset ){
preds.resize( data.Size() );
const unsigned ndata = static_cast<unsigned>( data.Size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
preds[j] = mparam.PredTransform
( mparam.base_score + base_gbm.Predict( data.data, j, buffer_offset + j ) );
}
}
/*! \brief get the first order and second order gradient, given the transformed predictions and labels */
inline void GetGradient( const std::vector<float> &preds,
const std::vector<float> &labels,
std::vector<float> &grad,
std::vector<float> &hess ){
grad.resize( preds.size() ); hess.resize( preds.size() );
const unsigned ndata = static_cast<unsigned>( preds.size() );
#pragma omp parallel for schedule( static )
for( unsigned j = 0; j < ndata; ++ j ){
grad[j] = mparam.FirstOrderGradient( preds[j], labels[j] );
hess[j] = mparam.SecondOrderGradient( preds[j], labels[j] );
}
}
private:
enum LossType{
kLinearSquare = 0,
kLogisticNeglik = 1,
kLogisticClassify = 2
};
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
float base_score;
/* \brief type of loss function */
int loss_type;
/* \brief number of features */
int num_feature;
/*! \brief reserved field, kept for binary model format compatibility */
int reserved[ 16 ];
/*! \brief constructor */
ModelParam( void ){
base_score = 0.5f;
loss_type = 0;
num_feature = 0;
memset( reserved, 0, sizeof( reserved ) );
}
/*!
 * \brief set parameters from outside
 * \param name name of the parameter
 * \param val value of the parameter
 */
inline void SetParam( const char *name, const char *val ){
if( !strcmp("base_score", name ) ) base_score = (float)atof( val );
if( !strcmp("loss_type", name ) ) loss_type = atoi( val );
if( !strcmp("bst:num_feature", name ) ) num_feature = atoi( val );
}
/*!
 * \brief adjust base_score: for logistic losses the configured probability
 *        is converted into margin space via the logit function
 */
inline void AdjustBase( void ){
if( loss_type == 1 || loss_type == 2 ){
utils::Assert( base_score > 0.0f && base_score < 1.0f, "sigmoid range constrain" );
base_score = - logf( 1.0f / base_score - 1.0f );
}
}
/*!
 * \brief transform the linear sum to prediction
 * \param x linear sum of boosting ensemble
 * \return transformed prediction
 */
inline float PredTransform( float x ){
switch( loss_type ){
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f/(1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
 * \brief calculate first order gradient of loss, given transformed prediction
 * \param predt transformed prediction
 * \param label true label
 * \return first order gradient
 */
inline float FirstOrderGradient( float predt, float label ) const{
switch( loss_type ){
case kLinearSquare: return predt - label;
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
 * \brief calculate second order gradient of loss, given transformed prediction
 * \param predt transformed prediction
 * \param label true label
 * \return second order gradient
 */
inline float SecondOrderGradient( float predt, float label ) const{
switch( loss_type ){
case kLinearSquare: return 1.0f;
case kLogisticClassify:
case kLogisticNeglik: return predt * ( 1 - predt );
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
 * \brief calculating the loss, given the predictions, labels and the loss type
 * \param preds the given predictions
 * \param labels the given labels
 * \return the specified loss
 */
inline float Loss(const std::vector<float> &preds, const std::vector<float> &labels) const{
switch( loss_type ){
case kLinearSquare: return SquareLoss(preds,labels);
case kLogisticNeglik:
case kLogisticClassify: return NegLoglikelihoodLoss(preds,labels);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
 * \brief calculating the square loss, given the predictions and labels
 * \param preds the given predictions
 * \param labels the given labels
 * \return the summation of square loss
 */
inline float SquareLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for(size_t i = 0; i < preds.size(); i++){
float dif = preds[i] - labels[i];
ans += dif * dif;
}
return ans;
}
/*!
 * \brief calculating the negative log-likelihood loss, given the predictions and labels
 * \param preds the given predictions
 * \param labels the given labels
 * \return the summation of negative log-likelihood loss
 */
inline float NegLoglikelihoodLoss(const std::vector<float> &preds, const std::vector<float> &labels) const{
float ans = 0.0;
for(size_t i = 0; i < preds.size(); i++)
ans -= labels[i] * logf(preds[i]) + ( 1 - labels[i] ) * logf(1 - preds[i]);
return ans;
}
};
private:
int silent;
EvalSet evaluator_;
booster::GBMBase base_gbm;
ModelParam mparam;
const DMatrix *train_;
std::vector<DMatrix *> evals_;
std::vector<std::string> evname_;
// NOTE(review): buffer_index_ appears unused within this class
std::vector<unsigned> buffer_index_;
private:
std::vector<float> grad_, hess_, preds_;
std::vector< std::vector<float> > eval_preds_;
};
}
};
#endif

View File

@@ -1,155 +0,0 @@
#ifndef XGBOOST_REG_DATA_H
#define XGBOOST_REG_DATA_H
/*!
* \file xgboost_reg_data.h
* \brief input data structure for regression and binary classification task.
* Format:
* The data should contain each data instance in each line.
* The format of line data is as below:
* label <nonzero feature dimension> [feature index:feature value]+
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cstdio>
#include <vector>
#include "../booster/xgboost_data.h"
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_stream.h"
namespace xgboost{
namespace regression{
/*! \brief data matrix for regression content */
/*! \brief data matrix for regression content */
struct DMatrix{
public:
    /*! \brief maximum feature dimension */
    unsigned num_feature;
    /*! \brief feature data content */
    booster::FMatrixS data;
    /*! \brief label of each instance */
    std::vector<float> labels;
public:
    /*! \brief default constructor */
    DMatrix( void ){}
    /*! \brief get the number of instances */
    inline size_t Size() const{
        return labels.size();
    }
    /*!
     * \brief load from text file, line format: label [findex:fvalue]+
     * \param fname name of text data
     * \param silent whether print information or not
     */
    inline void LoadText( const char* fname, bool silent = false ){
        data.Clear();
        FILE* file = utils::FopenCheck( fname, "r" );
        // label is initialized so it is defined even when the file is empty
        float label = 0.0f; bool init = true;
        char tmp[ 1024 ];
        std::vector<booster::bst_uint> findex;
        std::vector<booster::bst_float> fvalue;
        while( fscanf( file, "%s", tmp ) == 1 ){
            unsigned index; float value;
            if( sscanf( tmp, "%u:%f", &index, &value ) == 2 ){
                // "index:value" token: feature of the current instance
                findex.push_back( index ); fvalue.push_back( value );
            }else{
                // plain number token starts a new instance: flush the previous one
                if( !init ){
                    labels.push_back( label );
                    data.AddRow( findex, fvalue );
                }
                findex.clear(); fvalue.clear();
                utils::Assert( sscanf( tmp, "%f", &label ) == 1, "invalid format" );
                init = false;
            }
        }
        // flush the final instance; the guard fixes a bug where an empty file
        // pushed an uninitialized label and a spurious empty row
        if( !init ){
            labels.push_back( label );
            data.AddRow( findex, fvalue );
        }
        // initialize column support as well
        data.InitData();
        if( !silent ){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
        fclose(file);
    }
    /*!
     * \brief load from binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \return whether loading is success
     */
    inline bool LoadBinary( const char* fname, bool silent = false ){
        FILE *fp = fopen64( fname, "rb" );
        if( fp == NULL ) return false;
        utils::FileStream fs( fp );
        data.LoadBinary( fs );
        labels.resize( data.NumRow() );
        // guard: &labels[0] is undefined behavior for an empty vector
        if( data.NumRow() != 0 ){
            utils::Assert( fs.Read( &labels[0], sizeof(float) * data.NumRow() ) != 0, "DMatrix LoadBinary" );
        }
        fs.Close();
        // initialize column support as well
        data.InitData();
        if( !silent ){
            printf("%ux%u matrix with %lu entries is loaded from %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
        return true;
    }
    /*!
     * \brief save to binary file
     * \param fname name of binary data
     * \param silent whether print information or not
     */
    inline void SaveBinary( const char* fname, bool silent = false ){
        // initialize column support as well
        data.InitData();
        utils::FileStream fs( utils::FopenCheck( fname, "wb" ) );
        data.SaveBinary( fs );
        // guard: &labels[0] is undefined behavior for an empty vector
        if( data.NumRow() != 0 ){
            fs.Write( &labels[0], sizeof(float) * data.NumRow() );
        }
        fs.Close();
        if( !silent ){
            printf("%ux%u matrix with %lu entries is saved to %s\n",
                   (unsigned)data.NumRow(), (unsigned)data.NumCol(), (unsigned long)data.NumEntry(), fname );
        }
    }
    /*!
     * \brief cache load data given a file name, if filename ends with .buffer, direct load binary
     *        otherwise the function will first check if fname + '.buffer' exists,
     *        if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
     *        and try to create a buffer file
     * \param fname name of binary data
     * \param silent whether print information or not
     * \param savebuffer whether do save binary buffer if it is text
     */
    inline void CacheLoad( const char *fname, bool silent = false, bool savebuffer = true ){
        const int len = (int)strlen( fname );
        // ".buffer" is 7 characters; the previous check (len > 8) wrongly sent
        // 7- and 8-character names such as "a.buffer" down the text-load path
        if( len >= 7 && !strcmp( fname + len - 7, ".buffer") ){
            this->LoadBinary( fname, silent ); return;
        }
        char bname[ 1024 ];
        sprintf( bname, "%s.buffer", fname );
        if( !this->LoadBinary( bname, silent ) ){
            this->LoadText( fname, silent );
            if( savebuffer ) this->SaveBinary( bname, silent );
        }
    }
private:
    /*! \brief update num_feature info by scanning every stored row */
    inline void UpdateInfo( void ){
        this->num_feature = 0;
        for( size_t i = 0; i < data.NumRow(); i ++ ){
            booster::FMatrixS::Line sp = data[i];
            for( unsigned j = 0; j < sp.len; j ++ ){
                if( num_feature <= sp[j].findex ){
                    num_feature = sp[j].findex + 1;
                }
            }
        }
    }
};
};
};
#endif

View File

@@ -1,119 +0,0 @@
#ifndef XGBOOST_REG_EVAL_H
#define XGBOOST_REG_EVAL_H
/*!
* \file xgboost_reg_eval.h
* \brief evaluation metrics for regression and classification
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <cmath>
#include <vector>
#include <algorithm>
#include "../utils/xgboost_utils.h"
#include "../utils/xgboost_omp.h"
namespace xgboost{
namespace regression{
/*! \brief evaluator that evaluates the loss metrics */
/*! \brief evaluator interface: evaluates one loss metric over a prediction set */
struct IEvaluator{
/*!
 * \brief evaluate a specific metric
 * \param preds prediction
 * \param labels label
 */
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const = 0;
/*! \return name of metric */
virtual const char *Name( void ) const = 0;
/*! \brief virtual destructor: makes deletion through an IEvaluator* safe
 *  (EvalSet stores instances via base pointers) */
virtual ~IEvaluator( void ){}
};
/*! \brief RMSE */
/*! \brief root mean squared error between predictions and labels */
struct EvalRMSE : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
// accumulate squared residuals in parallel, then take the root of the mean
float sqsum = 0.0;
#pragma omp parallel for reduction(+:sqsum) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const float resid = preds[i] - labels[i];
sqsum += resid * resid;
}
return sqrtf( sqsum / ndata );
}
virtual const char *Name( void ) const{
return "rmse";
}
};
/*! \brief Error */
/*! \brief classification error rate with decision threshold 0.5 */
struct EvalError : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
unsigned nwrong = 0;
#pragma omp parallel for reduction(+:nwrong) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const float y = labels[i];
// predicted positive iff pred > 0.5; count disagreements with the label
if( preds[i] > 0.5f ? ( y < 0.5f ) : ( y > 0.5f ) ) nwrong += 1;
}
return static_cast<float>(nwrong) / ndata;
}
virtual const char *Name( void ) const{
return "error";
}
};
/*! \brief Error */
/*! \brief mean negative log-likelihood of binary predictions */
struct EvalLogLoss : public IEvaluator{
virtual float Eval( const std::vector<float> &preds,
const std::vector<float> &labels ) const{
const unsigned ndata = static_cast<unsigned>( preds.size() );
// BUGFIX: the accumulator was declared `unsigned nerr` and decremented by a
// fractional float term, truncating the value and wrapping below zero.
// Accumulate in a float instead.
float sum = 0.0f;
#pragma omp parallel for reduction(+:sum) schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const float y = labels[i];
const float py = preds[i];
// per-instance negative log-likelihood; py is assumed to lie strictly in (0,1)
sum += -( y * std::log(py) + (1.0f-y)*std::log(1.0f-py) );
}
return sum / ndata;
}
virtual const char *Name( void ) const{
return "negllik";
}
};
};
namespace regression{
/*! \brief a set of evaluators */
struct EvalSet{
public:
inline void AddEval( const char *name ){
if( !strcmp( name, "rmse") ) evals_.push_back( &rmse_ );
if( !strcmp( name, "error") ) evals_.push_back( &error_ );
if( !strcmp( name, "logloss") ) evals_.push_back( &logloss_ );
}
inline void Init( void ){
std::sort( evals_.begin(), evals_.end() );
evals_.resize( std::unique( evals_.begin(), evals_.end() ) - evals_.begin() );
}
inline void Eval( FILE *fo, const char *evname,
const std::vector<float> &preds,
const std::vector<float> &labels ) const{
for( size_t i = 0; i < evals_.size(); ++ i ){
float res = evals_[i]->Eval( preds, labels );
fprintf( fo, "\t%s-%s:%f", evname, evals_[i]->Name(), res );
}
}
private:
EvalRMSE rmse_;
EvalError error_;
EvalLogLoss logloss_;
std::vector<const IEvaluator*> evals_;
};
};
};
#endif

View File

@@ -1,280 +0,0 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <ctime>
#include <string>
#include <cstring>
#include "xgboost_reg.h"
#include "../utils/xgboost_fmap.h"
#include "../utils/xgboost_random.h"
#include "../utils/xgboost_config.h"
namespace xgboost{
namespace regression{
/*!
* \brief wrapping the training process of the gradient boosting regression model,
* given the configuration
* \author Kailong Chen: chenkl198812@gmail.com, Tianqi Chen: tianqi.chen@gmail.com
*/
class RegBoostTask{
public:
// Entry point: argv[1] is the configuration file; the remaining
// "name=value" arguments override settings from the file.
inline int Run( int argc, char *argv[] ){
if( argc < 2 ){
printf("Usage: <config>\n");
return 0;
}
utils::ConfigIterator itr( argv[1] );
while( itr.Next() ){
this->SetParam( itr.name(), itr.val() );
}
// command-line settings are applied after the config file, so they win
for( int i = 2; i < argc; i ++ ){
char name[256], val[256];
if( sscanf( argv[i], "%[^=]=%s", name, val ) == 2 ){
this->SetParam( name, val );
}
}
this->InitData();
this->InitLearner();
// dispatch on the configured task; "train" is the fall-through default
if( task == "dump" ){
this->TaskDump();
return 0;
}
if( task == "interact" ){
this->TaskInteractive(); return 0;
}
if( task == "dumppath" ){
this->TaskDumpPath(); return 0;
}
if( task == "eval" ){
this->TaskEval(); return 0;
}
if( task == "pred" ){
this->TaskPred();
}else{
this->TaskTrain();
}
return 0;
}
// Set one configuration parameter; every setting is also appended to cfg
// so InitLearner can replay the full configuration into the learner.
inline void SetParam( const char *name, const char *val ){
if( !strcmp("silent", name ) ) silent = atoi( val );
if( !strcmp("use_buffer", name ) ) use_buffer = atoi( val );
if( !strcmp("seed", name ) ) random::Seed( atoi(val) );
if( !strcmp("num_round", name ) ) num_round = atoi( val );
if( !strcmp("save_period", name ) ) save_period = atoi( val );
if( !strcmp("task", name ) ) task = val;
if( !strcmp("data", name ) ) train_path = val;
if( !strcmp("test:data", name ) ) test_path = val;
if( !strcmp("model_in", name ) ) model_in = val;
if( !strcmp("model_out", name ) ) model_out = val;
if( !strcmp("model_dir", name ) ) model_dir_path = val;
if( !strcmp("fmap", name ) ) name_fmap = val;
if( !strcmp("name_dump", name ) ) name_dump = val;
if( !strcmp("name_dumppath", name ) ) name_dumppath = val;
if( !strcmp("name_pred", name ) ) name_pred = val;
if( !strcmp("dump_stats", name ) ) dump_model_stats = atoi( val );
if( !strcmp("interact:action", name ) ) interact_action = val;
// "batch:*" settings are collected separately for interactive batch runs
if( !strncmp("batch:", name, 6 ) ){
cfg_batch.PushBack( name + 6, val );
}
// "eval[name]=path" registers one evaluation data set
if( !strncmp("eval[", name, 5 ) ) {
char evname[ 256 ];
utils::Assert( sscanf( name, "eval[%[^]]", evname ) == 1, "must specify evaluation name for display");
eval_data_names.push_back( std::string( evname ) );
eval_data_paths.push_back( std::string( val ) );
}
cfg.PushBack( name, val );
}
public:
RegBoostTask( void ){
// default parameters
silent = 0;
use_buffer = 1;
num_round = 10;
save_period = 0;
dump_model_stats = 0;
task = "train";
model_in = "NULL";
model_out = "NULL";
name_fmap = "NULL";
name_pred = "pred.txt";
name_dump = "dump.txt";
name_dumppath = "dump.path.txt";
model_dir_path = "./";
interact_action = "update";
}
// evaluation matrices are heap-allocated in InitData and owned here
~RegBoostTask( void ){
for( size_t i = 0; i < deval.size(); i ++ ){
delete deval[i];
}
}
private:
// Load training or test data (with binary-buffer caching), plus any
// registered evaluation sets, then hand all matrices to the learner.
inline void InitData( void ){
if( name_fmap != "NULL" ) fmap.LoadText( name_fmap.c_str() );
if( task == "dump" ) return;
if( task == "pred" || task == "dumppath" ){
data.CacheLoad( test_path.c_str(), silent!=0, use_buffer!=0 );
}else{
// training
data.CacheLoad( train_path.c_str(), silent!=0, use_buffer!=0 );
utils::Assert( eval_data_names.size() == eval_data_paths.size() );
for( size_t i = 0; i < eval_data_names.size(); ++ i ){
deval.push_back( new DMatrix() );
deval.back()->CacheLoad( eval_data_paths[i].c_str(), silent!=0, use_buffer!=0 );
}
}
learner.SetData( &data, deval, eval_data_names );
}
// Replay all saved settings into the learner, then either load an existing
// model (model_in) or initialize a fresh one (training only).
inline void InitLearner( void ){
cfg.BeforeFirst();
while( cfg.Next() ){
learner.SetParam( cfg.name(), cfg.val() );
}
if( model_in != "NULL" ){
utils::FileStream fi( utils::FopenCheck( model_in.c_str(), "rb") );
learner.LoadModel( fi );
fi.Close();
}else{
utils::Assert( task == "train", "model_in not specified" );
learner.InitModel();
}
learner.InitTrainer();
}
// Run num_round boosting iterations, evaluating after each and saving
// a snapshot every save_period rounds (0 = final model only).
inline void TaskTrain( void ){
const time_t start = time( NULL );
unsigned long elapsed = 0;
for( int i = 0; i < num_round; ++ i ){
elapsed = (unsigned long)(time(NULL) - start);
if( !silent ) printf("boosting round %d, %lu sec elapsed\n", i , elapsed );
learner.UpdateOneIter( i );
learner.EvalOneIter( i );
if( save_period != 0 && (i+1) % save_period == 0 ){
this->SaveModel( i );
}
elapsed = (unsigned long)(time(NULL) - start);
}
// always save final round
// NOTE(review): when the periodic save already covered the last round,
// model_out is not written even if it was specified — confirm this is intended
if( save_period == 0 || num_round % save_period != 0 ){
if( model_out == "NULL" ){
this->SaveModel( num_round - 1 );
}else{
this->SaveModel( model_out.c_str() );
}
}
if( !silent ){
printf("\nupdating end, %lu sec in all\n", elapsed );
}
}
// Evaluate the loaded model once, without updating it.
inline void TaskEval( void ){
learner.EvalOneIter( 0 );
}
// Apply the batch configuration: each "run" entry triggers one interactive
// update, other entries set learner parameters; the result is saved to model_out.
inline void TaskInteractive( void ){
const time_t start = time( NULL );
unsigned long elapsed = 0;
int batch_action = 0;
cfg_batch.BeforeFirst();
while( cfg_batch.Next() ){
if( !strcmp( cfg_batch.name(), "run" ) ){
learner.UpdateInteract( interact_action );
batch_action += 1;
} else{
learner.SetParam( cfg_batch.name(), cfg_batch.val() );
}
}
// no explicit "run" in the batch: perform a single update
if( batch_action == 0 ){
learner.UpdateInteract( interact_action );
}
utils::Assert( model_out != "NULL", "interactive mode must specify model_out" );
this->SaveModel( model_out.c_str() );
elapsed = (unsigned long)(time(NULL) - start);
if( !silent ){
printf("\ninteractive update, %d batch actions, %lu sec in all\n", batch_action, elapsed );
}
}
// Dump the model in human-readable text form to name_dump.
inline void TaskDump( void ){
FILE *fo = utils::FopenCheck( name_dump.c_str(), "w" );
learner.DumpModel( fo, fmap, dump_model_stats != 0 );
fclose( fo );
}
// Dump the tree paths taken by the loaded data to name_dumppath.
inline void TaskDumpPath( void ){
FILE *fo = utils::FopenCheck( name_dumppath.c_str(), "w" );
learner.DumpPath( fo, data );
fclose( fo );
}
inline void SaveModel( const char *fname ) const{
utils::FileStream fo( utils::FopenCheck( fname, "wb" ) );
learner.SaveModel( fo );
fo.Close();
}
// Save the round-i snapshot as "<model_dir>/<i+1, zero-padded to 4>.model".
inline void SaveModel( int i ) const{
char fname[256];
sprintf( fname ,"%s/%04d.model", model_dir_path.c_str(), i+1 );
this->SaveModel( fname );
}
// Predict on the loaded test data and write one prediction per line to name_pred.
inline void TaskPred( void ){
std::vector<float> preds;
if( !silent ) printf("start prediction...\n");
learner.Predict( preds, data );
if( !silent ) printf("writing prediction to %s\n", name_pred.c_str() );
FILE *fo = utils::FopenCheck( name_pred.c_str(), "w" );
for( size_t i = 0; i < preds.size(); i ++ ){
fprintf( fo, "%f\n", preds[i] );
}
fclose( fo );
}
private:
/* \brief whether silent */
int silent;
/* \brief whether use auto binary buffer */
int use_buffer;
/* \brief number of boosting iterations */
int num_round;
/* \brief the period to save the model, 0 means only save the final round model */
int save_period;
/*! \brief interact action */
std::string interact_action;
/* \brief the path of training/test data set */
std::string train_path, test_path;
/* \brief the path of test model file, or file to restart training */
std::string model_in;
/* \brief the path of final model file, to be saved */
std::string model_out;
/* \brief the path of directory containing the saved models */
std::string model_dir_path;
/* \brief task to perform */
std::string task;
/* \brief name of predict file */
std::string name_pred;
/* \brief whether dump statistics along with model */
int dump_model_stats;
/* \brief name of feature map */
std::string name_fmap;
/* \brief name of dump file */
std::string name_dump;
/* \brief name of dump path file */
std::string name_dumppath;
/* \brief the paths of validation data sets */
std::vector<std::string> eval_data_paths;
/* \brief the names of the evaluation data used in output log */
std::vector<std::string> eval_data_names;
/*! \brief saves configurations */
utils::ConfigSaver cfg;
/*! \brief batch configurations */
utils::ConfigSaver cfg_batch;
private:
DMatrix data;
std::vector<DMatrix*> deval;
utils::FeatMap fmap;
RegBoostLearner learner;
};
};
};
int main( int argc, char *argv[] ){
xgboost::random::Seed( 0 );
xgboost::regression::RegBoostTask tsk;
return tsk.Run( argc, argv );
}

26
tools/Makefile Normal file
View File

@@ -0,0 +1,26 @@
# toolchain and flags for the auxiliary tools (OpenMP + SSE2 enabled)
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
# specify tensor path
BIN = xgcombine_buffer
OBJ =
.PHONY: clean all
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
# per-target prerequisite lists; the generic rules below do the actual build
xgcombine_buffer : xgcombine_buffer.cpp
# link each binary from its .cpp/.o/.c prerequisites in one step
$(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
# compile each object from its first .cpp/.c prerequisite
$(OBJ) :
	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
install:
	cp -f -r $(BIN) $(INSTALL_PATH)
clean:
	$(RM) $(OBJ) $(BIN) *~

248
tools/xgcombine_buffer.cpp Normal file
View File

@@ -0,0 +1,248 @@
/*!
* a tool to combine different set of features into binary buffer
* not well organized code, but does its job
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <cstdio>
#include <cstring>
#include <ctime>
#include <cmath>
#include "../regrank/xgboost_regrank_data.h"
#include "../utils/xgboost_utils.h"
using namespace xgboost;
using namespace xgboost::booster;
using namespace xgboost::regrank;
// header in dataset
// header describing one feature input stream of the combined dataset
struct Header{
FILE *fi;     // source file of this feature stream
int tmp_num;  // number of features announced for the current line
int base;     // feature-index offset assigned by norm()
int num_feat; // number of features this stream may produce
// whether it's dense format
bool is_dense;
bool warned;  // ensures the out-of-bound warning is printed only once
// initialize every field; previously fi/tmp_num/base/num_feat were left
// indeterminate even though CheckBase reads num_feat
Header( void ){
fi = NULL; tmp_num = 0; base = 0; num_feat = 0;
this->warned = false; this->is_dense = false;
}
// warn (once per stream) when a feature index exceeds the declared range
inline void CheckBase( unsigned findex ){
if( findex >= (unsigned)num_feat && ! warned ) {
fprintf( stderr, "warning:some feature exceed bound, num_feat=%d\n", num_feat );
warned = true;
}
}
};
inline int norm( std::vector<Header> &vec, int base = 0 ){
int n = base;
for( size_t i = 0; i < vec.size(); i ++ ){
if( vec[i].is_dense ) vec[i].num_feat = 1;
vec[i].base = n; n += vec[i].num_feat;
}
return n;
}
// Close the input file of every feature stream.
inline void vclose( std::vector<Header> &vec ){
for( size_t k = 0; k < vec.size(); k ++ ) fclose( vec[k].fi );
}
// Read the per-line feature count of every sparse stream into tmp_num and
// return the total number of features the current line will contain;
// dense streams always contribute exactly one feature.
inline int readnum( std::vector<Header> &vec ){
int n = 0;
for( size_t i = 0; i < vec.size(); i ++ ){
if( !vec[i].is_dense ){
utils::Assert( fscanf( vec[i].fi, "%d", &vec[i].tmp_num ) == 1, "load num" );
n += vec[i].tmp_num;
}else{
n ++;
}
}
return n;
}
// Skip the current line in every stream (used for whitelist-filtered instances).
inline void vskip( std::vector<Header> &vec ){
for( size_t i = 0; i < vec.size(); i ++ ){
if( !vec[i].is_dense ){
// consume "<count> <rest-of-line>"; all conversions are suppressed (%*),
// so fscanf assigns nothing and success is any return value >= 0
utils::Assert( fscanf( vec[i].fi, "%*d%*[^\n]\n" ) >= 0 );
}else{
utils::Assert( fscanf( vec[i].fi, "%*f\n" ) >= 0 );
}
}
}
// Reads labels, optional weights/groups/whitelist, and all registered
// feature streams line by line, assembling them into the DMatrix it extends.
class DataLoader: public DMatrix{
public:
// whether to do node and edge feature renormalization
int rescale;
// maximum number of instances to load; -1 means unlimited
int linelimit;
public:
// fp: label stream; fwlist: instance whitelist; fgroup: group sizes; fweight: instance weights
FILE *fp, *fwlist, *fgroup, *fweight;
std::vector<Header> fheader;
std::vector<FMatrixS::REntry> entry;
DataLoader( void ){
rescale = 0;
linelimit = -1;
fp = NULL; fwlist = NULL; fgroup = NULL; fweight = NULL;
}
private:
// Append the features of the current line from every stream, shifting each
// sparse index by the stream's base offset (dense streams use base directly).
inline void Load( std::vector<unsigned> &findex, std::vector<float> &fvalue, std::vector<Header> &vec ){
unsigned fidx; float fv;
for( size_t i = 0; i < vec.size(); i ++ ){
if( !vec[i].is_dense ) {
for( int j = 0; j < vec[i].tmp_num; j ++ ){
utils::Assert( fscanf ( vec[i].fi, "%u:%f", &fidx, &fv ) == 2, "Error when load feat" );
vec[i].CheckBase( fidx );
fidx += vec[i].base;
findex.push_back( fidx ); fvalue.push_back( fv );
}
}else{
utils::Assert( fscanf ( vec[i].fi, "%f", &fv ) == 1, "load feat" );
fidx = vec[i].base;
findex.push_back( fidx ); fvalue.push_back( fv );
}
}
}
// L2-normalize one instance's feature values in place.
// NOTE(review): an all-zero instance divides by zero here — confirm inputs are non-zero when rescale is enabled
inline void DoRescale( std::vector<float> &vec ){
double sum = 0.0;
for( size_t i = 0; i < vec.size(); i ++ ){
sum += vec[i] * vec[i];
}
sum = sqrt( sum );
for( size_t i = 0; i < vec.size(); i ++ ){
vec[i] /= sum;
}
}
public:
// basically we are loading all the data inside
inline void Load( void ){
this->data.Clear();
float label, weight = 0.0f;
// ngleft: instances remaining in the current group; ngacc: instances accepted into it
unsigned ngleft = 0, ngacc = 0;
if( fgroup != NULL ){
info.group_ptr.clear();
info.group_ptr.push_back(0);
}
while( fscanf( fp, "%f", &label ) == 1 ){
if( ngleft == 0 && fgroup != NULL ){
utils::Assert( fscanf( fgroup, "%u", &ngleft ) == 1 );
}
if( fweight != NULL ){
utils::Assert( fscanf( fweight, "%f", &weight ) == 1 );
}
ngleft -= 1; ngacc += 1;
int pass = 1;
// whitelist value 0 means skip this instance entirely
if( fwlist != NULL ){
utils::Assert( fscanf( fwlist, "%u", &pass ) ==1 );
}
if( pass == 0 ){
vskip( fheader ); ngacc -= 1;
}else{
const int nfeat = readnum( fheader );
std::vector<unsigned> findex;
std::vector<float> fvalue;
// pairs
this->Load( findex, fvalue, fheader );
utils::Assert( findex.size() == (unsigned)nfeat );
if( rescale != 0 ) this->DoRescale( fvalue );
// push back data :)
this->info.labels.push_back( label );
// push back weight if any
if( fweight != NULL ){
this->info.weights.push_back( weight );
}
this->data.AddRow( findex, fvalue );
}
// close the finished group; groups emptied by the whitelist are dropped
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
ngacc = 0;
}
// linelimit
if( linelimit >= 0 ) {
if( -- linelimit <= 0 ) break;
}
}
// flush a trailing partially-read group, if any
if( ngleft == 0 && fgroup != NULL && ngacc != 0 ){
info.group_ptr.push_back( info.group_ptr.back() + ngacc );
utils::Assert( info.group_ptr.back() == data.NumRow(), "group size must match num rows" );
}
this->data.InitData();
}
};
const char *folder = "features";
// Entry point: argv[1] is the label/input name, argv[2] the output buffer;
// remaining arguments are flags or feature-stream names opened from
// "<folder>/<inname>.<arg>".
int main( int argc, char *argv[] ){
if( argc < 3 ){
printf("Usage:xgcombine_buffer <inname> <outname> [options] -f [features] -fd [densefeatures]\n"\
"options: -rescale -linelimit -fgroup <groupfilename> -wlist <whitelistinstance>\n");
return 0;
}
DataLoader loader;
time_t start = time( NULL );
// mode selects how subsequent bare arguments are treated: 0 = sparse stream, 2 = dense stream
int mode = 0;
for( int i = 3; i < argc; i ++ ){
if( !strcmp( argv[i], "-f") ){
mode = 0; continue;
}
if( !strcmp( argv[i], "-fd") ){
mode = 2; continue;
}
if( !strcmp( argv[i], "-rescale") ){
loader.rescale = 1; continue;
}
if( !strcmp( argv[i], "-wlist") ){
loader.fwlist = utils::FopenCheck( argv[ ++i ], "r" ); continue;
}
if( !strcmp( argv[i], "-fgroup") ){
loader.fgroup = utils::FopenCheck( argv[ ++i ], "r" ); continue;
}
if( !strcmp( argv[i], "-fweight") ){
loader.fweight = utils::FopenCheck( argv[ ++i ], "r" ); continue;
}
if( !strcmp( argv[i], "-linelimit") ){
loader.linelimit = atoi( argv[ ++i ] ); continue;
}
// not a flag: treat as a feature-stream name under the features folder
char name[ 256 ];
sprintf( name, "%s/%s.%s", folder, argv[1], argv[i] );
Header h;
h.fi = utils::FopenCheck( name, "r" );
if( mode == 2 ){
h.is_dense = true; h.num_feat = 1;
loader.fheader.push_back( h );
}else{
// sparse streams announce their feature count at the start of the file
utils::Assert( fscanf( h.fi, "%d", &h.num_feat ) == 1, "num feat" );
switch( mode ){
case 0: loader.fheader.push_back( h ); break;
default: ;
}
}
}
loader.fp = utils::FopenCheck( argv[1], "r" );
printf("num_features=%d\n", norm( loader.fheader ) );
printf("start creating buffer...\n");
loader.Load();
loader.SaveBinary( argv[2] );
// close files
fclose( loader.fp );
if( loader.fwlist != NULL ) fclose( loader.fwlist );
if( loader.fgroup != NULL ) fclose( loader.fgroup );
vclose( loader.fheader );
printf("all generation end, %lu sec used\n", (unsigned long)(time(NULL) - start) );
return 0;
}

View File

@@ -14,198 +14,203 @@
namespace xgboost{ namespace xgboost{
namespace utils{ namespace utils{
/*! /*!
* \brief an iterator that iterates over a configure file and gets the configures * \brief an iterator that iterates over a configure file and gets the configures
*/ */
class ConfigIterator{ class ConfigIterator{
public: public:
/*! /*!
* \brief constructor * \brief constructor
* \param fname name of configure file * \param fname name of configure file
*/ */
ConfigIterator( const char *fname ){ ConfigIterator(const char *fname){
fi = FopenCheck( fname, "r"); fi = FopenCheck(fname, "r");
ch_buf = fgetc( fi ); ch_buf = fgetc(fi);
} }
/*! \brief destructor */ /*! \brief destructor */
~ConfigIterator(){ ~ConfigIterator(){
fclose( fi ); fclose(fi);
} }
/*! /*!
* \brief get current name, called after Next returns true * \brief get current name, called after Next returns true
* \return current parameter name * \return current parameter name
*/ */
inline const char *name( void )const{ inline const char *name(void)const{
return s_name; return s_name;
} }
/*! /*!
* \brief get current value, called after Next returns true * \brief get current value, called after Next returns true
* \return current parameter value * \return current parameter value
*/ */
inline const char *val( void ) const{ inline const char *val(void) const{
return s_val; return s_val;
} }
/*! /*!
* \brief move iterator to next position * \brief move iterator to next position
* \return true if there is value in next position * \return true if there is value in next position
*/ */
inline bool Next( void ){ inline bool Next(void){
while( !feof( fi ) ){ while (!feof(fi)){
GetNextToken( s_name ); GetNextToken(s_name);
if( s_name[0] == '=') return false; if (s_name[0] == '=') return false;
if( GetNextToken( s_buf ) || s_buf[0] != '=' ) return false; if (GetNextToken(s_buf) || s_buf[0] != '=') return false;
if( GetNextToken( s_val ) || s_val[0] == '=' ) return false; if (GetNextToken(s_val) || s_val[0] == '=') return false;
return true; return true;
} }
return false; return false;
} }
private: private:
FILE *fi; FILE *fi;
char ch_buf; char ch_buf;
char s_name[256],s_val[256],s_buf[246]; char s_name[256], s_val[256], s_buf[246];
inline void SkipLine(){ inline void SkipLine(){
do{ do{
ch_buf = fgetc( fi ); ch_buf = fgetc(fi);
}while( ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r' ); } while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
} }
inline void ParseStr( char tok[] ){ inline void ParseStr(char tok[]){
int i = 0; int i = 0;
while( (ch_buf = fgetc(fi)) != EOF ){ while ((ch_buf = fgetc(fi)) != EOF){
switch( ch_buf ){ switch (ch_buf){
case '\\': tok[i++] = fgetc( fi ); break; case '\\': tok[i++] = fgetc(fi); break;
case '\"': tok[i++] = '\0'; case '\"': tok[i++] = '\0';
return; return;
case '\r': case '\r':
case '\n': Error("unterminated string"); break; case '\n': Error("unterminated string"); break;
default: tok[i++] = ch_buf; default: tok[i++] = ch_buf;
} }
} }
Error("unterminated string"); Error("unterminated string");
} }
// return newline // return newline
inline bool GetNextToken( char tok[] ){ inline bool GetNextToken(char tok[]){
int i = 0; int i = 0;
bool new_line = false; bool new_line = false;
while( ch_buf != EOF ){ while (ch_buf != EOF){
switch( ch_buf ){ switch (ch_buf){
case '#' : SkipLine(); new_line = true; break; case '#': SkipLine(); new_line = true; break;
case '\"': case '\"':
if( i == 0 ){ if (i == 0){
ParseStr( tok );ch_buf = fgetc(fi); return new_line; ParseStr(tok); ch_buf = fgetc(fi); return new_line;
}else{ }
Error("token followed directly by string"); else{
Error("token followed directly by string");
} }
case '=': case '=':
if( i == 0 ) { if (i == 0) {
ch_buf = fgetc( fi ); ch_buf = fgetc(fi);
tok[0] = '='; tok[0] = '=';
tok[1] = '\0'; tok[1] = '\0';
}else{ }
tok[i] = '\0'; else{
tok[i] = '\0';
} }
return new_line; return new_line;
case '\r': case '\r':
case '\n': case '\n':
if( i == 0 ) new_line = true; if (i == 0) new_line = true;
case '\t': case '\t':
case ' ' : case ' ':
ch_buf = fgetc( fi ); ch_buf = fgetc(fi);
if( i > 0 ){ if (i > 0){
tok[i] = '\0'; tok[i] = '\0';
return new_line; return new_line;
} }
break; break;
default: default:
tok[i++] = ch_buf; tok[i++] = ch_buf;
ch_buf = fgetc( fi ); ch_buf = fgetc(fi);
break; break;
} }
} }
return true; return true;
} }
}; };
}; };
namespace utils{ namespace utils{
/*! /*!
* \brief a class that save parameter configurations * \brief a class that save parameter configurations
* temporally and allows to get them out later * temporally and allows to get them out later
* there are two kinds of priority in ConfigSaver * there are two kinds of priority in ConfigSaver
*/ */
class ConfigSaver{ class ConfigSaver{
public: public:
/*! \brief constructor */ /*! \brief constructor */
ConfigSaver( void ){ idx = 0; } ConfigSaver(void){ idx = 0; }
/*! \brief clear all saves */ /*! \brief clear all saves */
inline void Clear( void ){ inline void Clear(void){
idx = 0; idx = 0;
names.clear(); values.clear(); names.clear(); values.clear();
names_high.clear(); values_high.clear(); names_high.clear(); values_high.clear();
} }
/*! /*!
* \brief push back a parameter setting * \brief push back a parameter setting
* \param name name of parameter * \param name name of parameter
* \param val value of parameter * \param val value of parameter
* \param priority whether the setting has higher priority: high priority occurs * \param priority whether the setting has higher priority: high priority occurs
* latter when read from ConfigSaver, and can overwrite existing settings * latter when read from ConfigSaver, and can overwrite existing settings
*/ */
inline void PushBack( const char *name, const char *val, int priority = 0 ){ inline void PushBack(const char *name, const char *val, int priority = 0){
if( priority == 0 ){ if (priority == 0){
names.push_back( std::string( name ) ); names.push_back(std::string(name));
values.push_back( std::string( val ) ); values.push_back(std::string(val));
}else{ }
names_high.push_back( std::string( name ) ); else{
values_high.push_back( std::string( val ) ); names_high.push_back(std::string(name));
values_high.push_back(std::string(val));
} }
} }
/*! \brief set pointer to beginning of the ConfigSaver */ /*! \brief set pointer to beginning of the ConfigSaver */
inline void BeforeFirst( void ){ inline void BeforeFirst(void){
idx = 0; idx = 0;
} }
/*! /*!
* \brief move iterator to next position * \brief move iterator to next position
* \return true if there is value in next position * \return true if there is value in next position
*/ */
inline bool Next( void ){ inline bool Next(void){
if( idx >= names.size() + names_high.size() ){ if (idx >= names.size() + names_high.size()){
return false; return false;
} }
idx ++; idx++;
return true; return true;
} }
/*! /*!
* \brief get current name, called after Next returns true * \brief get current name, called after Next returns true
* \return current parameter name * \return current parameter name
*/ */
inline const char *name( void ) const{ inline const char *name(void) const{
Assert( idx > 0, "can't call name before first"); Assert(idx > 0, "can't call name before first");
size_t i = idx - 1; size_t i = idx - 1;
if( i >= names.size() ){ if (i >= names.size()){
return names_high[ i - names.size() ].c_str(); return names_high[i - names.size()].c_str();
}else{ }
return names[ i ].c_str(); else{
return names[i].c_str();
} }
} }
/*! /*!
* \brief get current value, called after Next returns true * \brief get current value, called after Next returns true
* \return current parameter value * \return current parameter value
*/ */
inline const char *val( void ) const{ inline const char *val(void) const{
Assert( idx > 0, "can't call name before first"); Assert(idx > 0, "can't call name before first");
size_t i = idx - 1; size_t i = idx - 1;
if( i >= values.size() ){ if (i >= values.size()){
return values_high[ i - values.size() ].c_str(); return values_high[i - values.size()].c_str();
}else{ }
return values[ i ].c_str(); else{
return values[i].c_str();
} }
} }
private: private:
std::vector<std::string> names; std::vector<std::string> names;
std::vector<std::string> values; std::vector<std::string> values;
std::vector<std::string> names_high; std::vector<std::string> names_high;
std::vector<std::string> values_high; std::vector<std::string> values_high;
size_t idx; size_t idx;
}; };
}; };

View File

@@ -16,48 +16,48 @@ namespace xgboost{
class FeatMap{ class FeatMap{
public: public:
enum Type{ enum Type{
kIndicator = 0, kIndicator = 0,
kQuantitive = 1, kQuantitive = 1,
kInteger = 2, kInteger = 2,
kFloat = 3 kFloat = 3
}; };
public: public:
/*! \brief load feature map from text format */ /*! \brief load feature map from text format */
inline void LoadText( const char *fname ){ inline void LoadText(const char *fname){
FILE *fi = utils::FopenCheck( fname, "r" ); FILE *fi = utils::FopenCheck(fname, "r");
this->LoadText( fi ); this->LoadText(fi);
fclose( fi ); fclose(fi);
} }
/*! \brief load feature map from text format */ /*! \brief load feature map from text format */
inline void LoadText( FILE *fi ){ inline void LoadText(FILE *fi){
int fid; int fid;
char fname[256], ftype[256]; char fname[1256], ftype[1256];
while( fscanf( fi, "%d%s%s", &fid, fname, ftype ) == 3 ){ while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3){
utils::Assert( fid == (int)names_.size(), "invalid fmap format" ); utils::Assert(fid == (int)names_.size(), "invalid fmap format");
names_.push_back( std::string(fname) ); names_.push_back(std::string(fname));
types_.push_back( GetType( ftype ) ); types_.push_back(GetType(ftype));
} }
} }
/*! \brief number of known features */ /*! \brief number of known features */
size_t size( void ) const{ size_t size(void) const{
return names_.size(); return names_.size();
} }
/*! \brief return name of specific feature */ /*! \brief return name of specific feature */
const char* name( size_t idx ) const{ const char* name(size_t idx) const{
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" ); utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
return names_[ idx ].c_str(); return names_[idx].c_str();
} }
/*! \brief return type of specific feature */ /*! \brief return type of specific feature */
const Type& type( size_t idx ) const{ const Type& type(size_t idx) const{
utils::Assert( idx < names_.size(), "utils::FMap::name feature index exceed bound" ); utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
return types_[ idx ]; return types_[idx];
} }
private: private:
inline static Type GetType( const char *tname ){ inline static Type GetType(const char *tname){
if( !strcmp( "i", tname ) ) return kIndicator; if (!strcmp("i", tname)) return kIndicator;
if( !strcmp( "q", tname ) ) return kQuantitive; if (!strcmp("q", tname)) return kQuantitive;
if( !strcmp( "int", tname ) ) return kInteger; if (!strcmp("int", tname)) return kInteger;
if( !strcmp( "float", tname ) ) return kFloat; if (!strcmp("float", tname)) return kFloat;
utils::Error("unknown feature type, use i for indicator and q for quantity"); utils::Error("unknown feature type, use i for indicator and q for quantity");
return kIndicator; return kIndicator;
} }
@@ -73,50 +73,50 @@ namespace xgboost{
/*! \brief feature constraint, allow or disallow some feature during training */ /*! \brief feature constraint, allow or disallow some feature during training */
class FeatConstrain{ class FeatConstrain{
public: public:
FeatConstrain( void ){ FeatConstrain(void){
default_state_ = +1; default_state_ = +1;
} }
/*!\brief set parameters */ /*!\brief set parameters */
inline void SetParam( const char *name, const char *val ){ inline void SetParam(const char *name, const char *val){
int a, b; int a, b;
if( !strcmp( name, "fban") ){ if (!strcmp(name, "fban")){
this->ParseRange( val, a, b ); this->ParseRange(val, a, b);
this->SetRange( a, b, -1 ); this->SetRange(a, b, -1);
} }
if( !strcmp( name, "fpass") ){ if (!strcmp(name, "fpass")){
this->ParseRange( val, a, b ); this->ParseRange(val, a, b);
this->SetRange( a, b, +1 ); this->SetRange(a, b, +1);
} }
if( !strcmp( name, "fdefault") ){ if (!strcmp(name, "fdefault")){
default_state_ = atoi( val ); default_state_ = atoi(val);
} }
} }
/*! \brief whether constrain is specified */ /*! \brief whether constrain is specified */
inline bool HasConstrain( void ) const { inline bool HasConstrain(void) const {
return state_.size() != 0 && default_state_ == 1; return state_.size() != 0 && default_state_ == 1;
} }
/*! \brief whether a feature index is banned or not */ /*! \brief whether a feature index is banned or not */
inline bool NotBanned( unsigned index ) const{ inline bool NotBanned(unsigned index) const{
int rt = index < state_.size() ? state_[index] : default_state_; int rt = index < state_.size() ? state_[index] : default_state_;
if( rt == 0 ) rt = default_state_; if (rt == 0) rt = default_state_;
return rt == 1; return rt == 1;
} }
private: private:
inline void SetRange( int a, int b, int st ){ inline void SetRange(int a, int b, int st){
if( b > (int)state_.size() ) state_.resize( b, 0 ); if (b >(int)state_.size()) state_.resize(b, 0);
for( int i = a; i < b; ++ i ){ for (int i = a; i < b; ++i){
state_[i] = st; state_[i] = st;
} }
} }
inline void ParseRange( const char *val, int &a, int &b ){ inline void ParseRange(const char *val, int &a, int &b){
if( sscanf( val, "%d-%d", &a, &b ) == 2 ) return; if (sscanf(val, "%d-%d", &a, &b) == 2) return;
utils::Assert( sscanf( val, "%d", &a ) == 1 ); utils::Assert(sscanf(val, "%d", &a) == 1);
b = a + 1; b = a + 1;
} }
/*! \brief default state */ /*! \brief default state */
int default_state_; int default_state_;
/*! \brief whether the state here is, +1:pass, -1: ban, 0:default */ /*! \brief whether the state here is, +1:pass, -1: ban, 0:default */
std::vector<int> state_; std::vector<int> state_;
}; };
}; // namespace utils }; // namespace utils
}; // namespace xgboost }; // namespace xgboost

View File

@@ -2,7 +2,7 @@
* \file xgboost_matrix_csr.h * \file xgboost_matrix_csr.h
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
* \author Tianqi Chen: tianqi.tchen@gmail.com * \author Tianqi Chen: tianqi.tchen@gmail.com
*/ */
#ifndef XGBOOST_MATRIX_CSR_H #ifndef XGBOOST_MATRIX_CSR_H
#define XGBOOST_MATRIX_CSR_H #define XGBOOST_MATRIX_CSR_H
#include <vector> #include <vector>
@@ -11,13 +11,13 @@
namespace xgboost{ namespace xgboost{
namespace utils{ namespace utils{
/*! /*!
* \brief a class used to help construct CSR format matrix, * \brief a class used to help construct CSR format matrix,
* can be used to convert row major CSR to column major CSR * can be used to convert row major CSR to column major CSR
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
* \tparam whether enabling the usage of aclist, this option must be enabled manually * \tparam whether enabling the usage of aclist, this option must be enabled manually
*/ */
template<typename IndexType,bool UseAcList = false> template<typename IndexType, bool UseAcList = false>
struct SparseCSRMBuilder{ struct SparseCSRMBuilder{
private: private:
/*! \brief dummy variable used in the indicator matrix construction */ /*! \brief dummy variable used in the indicator matrix construction */
@@ -29,100 +29,102 @@ namespace xgboost{
/*! \brief a list of active rows, used when many rows are empty */ /*! \brief a list of active rows, used when many rows are empty */
std::vector<size_t> &aclist; std::vector<size_t> &aclist;
public: public:
SparseCSRMBuilder( std::vector<size_t> &p_rptr, SparseCSRMBuilder(std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex ) std::vector<IndexType> &p_findex)
:rptr(p_rptr), findex( p_findex ), aclist( dummy_aclist ){ :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist){
Assert( !UseAcList, "enabling bug" ); Assert(!UseAcList, "enabling bug");
} }
/*! \brief use with caution! rptr must be cleaned before use */ /*! \brief use with caution! rptr must be cleaned before use */
SparseCSRMBuilder( std::vector<size_t> &p_rptr, SparseCSRMBuilder(std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex, std::vector<IndexType> &p_findex,
std::vector<size_t> &p_aclist ) std::vector<size_t> &p_aclist)
:rptr(p_rptr), findex( p_findex ), aclist( p_aclist ){ :rptr(p_rptr), findex(p_findex), aclist(p_aclist){
Assert( UseAcList, "must manually enable the option use aclist" ); Assert(UseAcList, "must manually enable the option use aclist");
} }
public: public:
/*! /*!
* \brief step 1: initialize the number of rows in the data, not necessary exact * \brief step 1: initialize the number of rows in the data, not necessary exact
* \nrows number of rows in the matrix, can be smaller than expected * \nrows number of rows in the matrix, can be smaller than expected
*/ */
inline void InitBudget( size_t nrows = 0 ){ inline void InitBudget(size_t nrows = 0){
if( !UseAcList ){ if (!UseAcList){
rptr.clear(); rptr.clear();
rptr.resize( nrows + 1, 0 ); rptr.resize(nrows + 1, 0);
}else{ }
Assert( nrows + 1 == rptr.size(), "rptr must be initialized already" ); else{
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
this->Cleanup(); this->Cleanup();
} }
} }
/*! /*!
* \brief step 2: add budget to each rows, this function is called when aclist is used * \brief step 2: add budget to each rows, this function is called when aclist is used
* \param row_id the id of the row * \param row_id the id of the row
* \param nelem number of element budget add to this row * \param nelem number of element budget add to this row
*/ */
inline void AddBudget( size_t row_id, size_t nelem = 1 ){ inline void AddBudget(size_t row_id, size_t nelem = 1){
if( rptr.size() < row_id + 2 ){ if (rptr.size() < row_id + 2){
rptr.resize( row_id + 2, 0 ); rptr.resize(row_id + 2, 0);
} }
if( UseAcList ){ if (UseAcList){
if( rptr[ row_id + 1 ] == 0 ) aclist.push_back( row_id ); if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
} }
rptr[ row_id + 1 ] += nelem; rptr[row_id + 1] += nelem;
} }
/*! \brief step 3: initialize the necessary storage */ /*! \brief step 3: initialize the necessary storage */
inline void InitStorage( void ){ inline void InitStorage(void){
// initialize rptr to be beginning of each segment // initialize rptr to be beginning of each segment
size_t start = 0; size_t start = 0;
if( !UseAcList ){ if (!UseAcList){
for( size_t i = 1; i < rptr.size(); i ++ ){ for (size_t i = 1; i < rptr.size(); i++){
size_t rlen = rptr[ i ]; size_t rlen = rptr[i];
rptr[ i ] = start; rptr[i] = start;
start += rlen;
}
}else{
// case with active list
std::sort( aclist.begin(), aclist.end() );
for( size_t i = 0; i < aclist.size(); i ++ ){
size_t ridx = aclist[ i ];
size_t rlen = rptr[ ridx + 1 ];
rptr[ ridx + 1 ] = start;
// set previous rptr to right position if previous feature is not active
if( i == 0 || ridx != aclist[i-1] + 1 ) rptr[ ridx ] = start;
start += rlen; start += rlen;
} }
} }
findex.resize( start ); else{
// case with active list
std::sort(aclist.begin(), aclist.end());
for (size_t i = 0; i < aclist.size(); i++){
size_t ridx = aclist[i];
size_t rlen = rptr[ridx + 1];
rptr[ridx + 1] = start;
// set previous rptr to right position if previous feature is not active
if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
start += rlen;
}
}
findex.resize(start);
} }
/*! /*!
* \brief step 4: * \brief step 4:
* used in indicator matrix construction, add new * used in indicator matrix construction, add new
* element to each row, the number of calls shall be exactly same as add_budget * element to each row, the number of calls shall be exactly same as add_budget
*/ */
inline void PushElem( size_t row_id, IndexType col_id ){ inline void PushElem(size_t row_id, IndexType col_id){
size_t &rp = rptr[ row_id + 1 ]; size_t &rp = rptr[row_id + 1];
findex[ rp ++ ] = col_id; findex[rp++] = col_id;
} }
/*! /*!
* \brief step 5: only needed when aclist is used * \brief step 5: only needed when aclist is used
* clean up the rptr for next usage * clean up the rptr for next usage
*/ */
inline void Cleanup( void ){ inline void Cleanup(void){
Assert( UseAcList, "this function can only be called use AcList" ); Assert(UseAcList, "this function can only be called use AcList");
for( size_t i = 0; i < aclist.size(); i ++ ){ for (size_t i = 0; i < aclist.size(); i++){
const size_t ridx = aclist[i]; const size_t ridx = aclist[i];
rptr[ ridx ] = 0; rptr[ ridx + 1 ] = 0; rptr[ridx] = 0; rptr[ridx + 1] = 0;
} }
aclist.clear(); aclist.clear();
} }
}; };
}; };
namespace utils{ namespace utils{
/*! /*!
* \brief simple sparse matrix container * \brief simple sparse matrix container
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
*/ */
template<typename IndexType> template<typename IndexType>
struct SparseCSRMat{ struct SparseCSRMat{
private: private:
@@ -134,22 +136,22 @@ namespace xgboost{
/*! \brief matrix builder*/ /*! \brief matrix builder*/
SparseCSRMBuilder<IndexType> builder; SparseCSRMBuilder<IndexType> builder;
public: public:
SparseCSRMat( void ):builder( rptr, findex ){ SparseCSRMat(void) :builder(rptr, findex){
} }
public: public:
/*! \return number of rows in the matrx */ /*! \return number of rows in the matrx */
inline size_t NumRow( void ) const{ inline size_t NumRow(void) const{
return rptr.size() - 1; return rptr.size() - 1;
} }
/*! \return number of elements r-th row */ /*! \return number of elements r-th row */
inline size_t NumElem( size_t r ) const{ inline size_t NumElem(size_t r) const{
return rptr[ r + 1 ] - rptr[ r ]; return rptr[r + 1] - rptr[r];
} }
/*! \return r-th row */ /*! \return r-th row */
inline const IndexType *operator[]( size_t r ) const{ inline const IndexType *operator[](size_t r) const{
return &findex[ rptr[r] ]; return &findex[rptr[r]];
} }
}; };
}; };
}; };
#endif #endif

View File

@@ -3,7 +3,7 @@
/*! /*!
* \file xgboost_omp.h * \file xgboost_omp.h
* \brief header to handle OpenMP compatibility issues * \brief header to handle OpenMP compatibility issues
* *
* \author Tianqi Chen: tianqi.tchen@gmail.com * \author Tianqi Chen: tianqi.tchen@gmail.com
*/ */
@@ -13,6 +13,6 @@
#warning "OpenMP is not available, compile to single thread code" #warning "OpenMP is not available, compile to single thread code"
inline int omp_get_thread_num() { return 0; } inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; } inline int omp_get_num_threads() { return 1; }
inline void omp_set_num_threads( int nthread ) {} inline void omp_set_num_threads(int nthread) {}
#endif #endif
#endif #endif

View File

@@ -23,109 +23,126 @@ typedef unsigned int uint32_t;
namespace xgboost{ namespace xgboost{
namespace random{ namespace random{
/*! \brief seed the PRNG */ /*! \brief seed the PRNG */
inline void Seed( uint32_t seed ){ inline void Seed(uint32_t seed){
srand( seed ); srand(seed);
} }
/*! \brief return a real number uniform in [0,1) */ /*! \brief return a real number uniform in [0,1) */
inline double NextDouble(){ inline double NextDouble(){
return static_cast<double>( rand() ) / (static_cast<double>( RAND_MAX )+1.0); return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
} }
/*! \brief return a real numer uniform in (0,1) */ /*! \brief return a real numer uniform in (0,1) */
inline double NextDouble2(){ inline double NextDouble2(){
return (static_cast<double>( rand() ) + 1.0 ) / (static_cast<double>(RAND_MAX) + 2.0); return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
} }
}; };
namespace random{ namespace random{
/*! \brief return a random number */ /*! \brief return a random number */
inline uint32_t NextUInt32( void ){ inline uint32_t NextUInt32(void){
return (uint32_t)rand(); return (uint32_t)rand();
} }
/*! \brief return a random number in n */ /*! \brief return a random number in n */
inline uint32_t NextUInt32( uint32_t n ){ inline uint32_t NextUInt32(uint32_t n){
return (uint32_t) floor( NextDouble() * n ) ; return (uint32_t)floor(NextDouble() * n);
} }
/*! \brief return x~N(0,1) */ /*! \brief return x~N(0,1) */
inline double SampleNormal(){ inline double SampleNormal(){
double x,y,s; double x, y, s;
do{ do{
x = 2 * NextDouble2() - 1.0; x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0; y = 2 * NextDouble2() - 1.0;
s = x*x + y*y; s = x*x + y*y;
}while( s >= 1.0 || s == 0.0 ); } while (s >= 1.0 || s == 0.0);
return x * sqrt( -2.0 * log(s) / s ) ; return x * sqrt(-2.0 * log(s) / s);
} }
/*! \brief return iid x,y ~N(0,1) */ /*! \brief return iid x,y ~N(0,1) */
inline void SampleNormal2D( double &xx, double &yy ){ inline void SampleNormal2D(double &xx, double &yy){
double x,y,s; double x, y, s;
do{ do{
x = 2 * NextDouble2() - 1.0; x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0; y = 2 * NextDouble2() - 1.0;
s = x*x + y*y; s = x*x + y*y;
}while( s >= 1.0 || s == 0.0 ); } while (s >= 1.0 || s == 0.0);
double t = sqrt( -2.0 * log(s) / s ) ; double t = sqrt(-2.0 * log(s) / s);
xx = x * t; xx = x * t;
yy = y * t; yy = y * t;
} }
/*! \brief return x~N(mu,sigma^2) */ /*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal( double mu, double sigma ){ inline double SampleNormal(double mu, double sigma){
return SampleNormal() * sigma + mu; return SampleNormal() * sigma + mu;
} }
/*! \brief return 1 with probability p, coin flip */ /*! \brief return 1 with probability p, coin flip */
inline int SampleBinary( double p ){ inline int SampleBinary(double p){
return NextDouble() < p; return NextDouble() < p;
} }
/*! \brief return distribution from Gamma( alpha, beta ) */ /*! \brief return distribution from Gamma( alpha, beta ) */
inline double SampleGamma( double alpha, double beta ) { inline double SampleGamma(double alpha, double beta) {
if ( alpha < 1.0 ) { if (alpha < 1.0) {
double u; double u;
do { do {
u = NextDouble(); u = NextDouble();
} while (u == 0.0); } while (u == 0.0);
return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha); return SampleGamma(alpha + 1.0, beta) * pow(u, 1.0 / alpha);
} else { }
double d,c,x,v,u; else {
d = alpha - 1.0/3.0; double d, c, x, v, u;
c = 1.0 / sqrt( 9.0 * d ); d = alpha - 1.0 / 3.0;
c = 1.0 / sqrt(9.0 * d);
do { do {
do { do {
x = SampleNormal(); x = SampleNormal();
v = 1.0 + c*x; v = 1.0 + c*x;
} while ( v <= 0.0 ); } while (v <= 0.0);
v = v * v * v; v = v * v * v;
u = NextDouble(); u = NextDouble();
} while ( (u >= (1.0 - 0.0331 * (x*x) * (x*x))) } while ((u >= (1.0 - 0.0331 * (x*x) * (x*x)))
&& (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))) ); && (log(u) >= (0.5 * x * x + d * (1.0 - v + log(v)))));
return d * v / beta; return d * v / beta;
} }
} }
template<typename T> template<typename T>
inline void Exchange( T &a, T &b ){ inline void Exchange(T &a, T &b){
T c; T c;
c = a; c = a;
a = b; a = b;
b = c; b = c;
} }
template<typename T> template<typename T>
inline void Shuffle( T *data, size_t sz ){ inline void Shuffle(T *data, size_t sz){
if( sz == 0 ) return; if (sz == 0) return;
for( uint32_t i = (uint32_t)sz - 1; i > 0; i-- ){ for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
Exchange( data[i], data[ NextUInt32( i+1 ) ] ); Exchange(data[i], data[NextUInt32(i + 1)]);
} }
} }
// random shuffle the data inside, require PRNG // random shuffle the data inside, require PRNG
template<typename T> template<typename T>
inline void Shuffle( std::vector<T> &data ){ inline void Shuffle(std::vector<T> &data){
Shuffle( &data[0], data.size() ); Shuffle(&data[0], data.size());
} }
}; };
namespace random{
/*! \brief random number generator with independent random number seed*/
struct Random{
/*! \brief set random number seed */
inline void Seed( unsigned sd ){
this->rseed = sd;
}
/*! \brief return a real number uniform in [0,1) */
inline double RandDouble( void ){
return static_cast<double>( rand_r( &rseed ) ) / (static_cast<double>( RAND_MAX )+1.0);
}
// random number seed
unsigned rseed;
};
};
}; };
#endif #endif

View File

@@ -9,44 +9,44 @@
*/ */
namespace xgboost{ namespace xgboost{
namespace utils{ namespace utils{
/*! /*!
* \brief interface of stream I/O, used to serialize model * \brief interface of stream I/O, used to serialize model
*/ */
class IStream{ class IStream{
public: public:
/*! /*!
* \brief read data from stream * \brief read data from stream
* \param ptr pointer to memory buffer * \param ptr pointer to memory buffer
* \param size size of block * \param size size of block
* \return usually is the size of data readed * \return usually is the size of data readed
*/ */
virtual size_t Read( void *ptr, size_t size ) = 0; virtual size_t Read(void *ptr, size_t size) = 0;
/*! /*!
* \brief write data to stream * \brief write data to stream
* \param ptr pointer to memory buffer * \param ptr pointer to memory buffer
* \param size size of block * \param size size of block
*/ */
virtual void Write( const void *ptr, size_t size ) = 0; virtual void Write(const void *ptr, size_t size) = 0;
/*! \brief virtual destructor */ /*! \brief virtual destructor */
virtual ~IStream( void ){} virtual ~IStream(void){}
}; };
/*! \brief implementation of file i/o stream */ /*! \brief implementation of file i/o stream */
class FileStream: public IStream{ class FileStream : public IStream{
private: private:
FILE *fp; FILE *fp;
public: public:
FileStream( FILE *fp ){ FileStream(FILE *fp){
this->fp = fp; this->fp = fp;
} }
virtual size_t Read( void *ptr, size_t size ){ virtual size_t Read(void *ptr, size_t size){
return fread( ptr, size, 1, fp ); return fread(ptr, size, 1, fp);
} }
virtual void Write( const void *ptr, size_t size ){ virtual void Write(const void *ptr, size_t size){
fwrite( ptr, size, 1, fp ); fwrite(ptr, size, 1, fp);
} }
inline void Close( void ){ inline void Close(void){
fclose( fp ); fclose(fp);
} }
}; };
}; };

View File

@@ -36,32 +36,34 @@ extern "C"{
namespace xgboost{ namespace xgboost{
/*! \brief namespace for helper utils of the project */ /*! \brief namespace for helper utils of the project */
namespace utils{ namespace utils{
inline void Error( const char *msg ){ inline void Error(const char *msg){
fprintf( stderr, "Error:%s\n",msg ); fprintf(stderr, "Error:%s\n", msg);
exit( -1 ); fflush(stderr);
} exit(-1);
inline void Assert( bool exp ){
if( !exp ) Error( "AssertError" );
}
inline void Assert( bool exp, const char *msg ){
if( !exp ) Error( msg );
} }
inline void Warning( const char *msg ){ inline void Assert(bool exp){
fprintf( stderr, "warning:%s\n",msg ); if (!exp) Error("AssertError");
}
inline void Assert(bool exp, const char *msg){
if (!exp) Error(msg);
}
inline void Warning(const char *msg){
fprintf(stderr, "warning:%s\n", msg);
} }
/*! \brief replace fopen, report error when the file open fails */ /*! \brief replace fopen, report error when the file open fails */
inline FILE *FopenCheck( const char *fname , const char *flag ){ inline FILE *FopenCheck(const char *fname, const char *flag){
FILE *fp = fopen64( fname , flag ); FILE *fp = fopen64(fname, flag);
if( fp == NULL ){ if (fp == NULL){
fprintf( stderr, "can not open file \"%s\"\n",fname ); fprintf(stderr, "can not open file \"%s\" \n", fname);
exit( -1 ); fflush(stderr);
exit(-1);
} }
return fp; return fp;
} }
}; };
}; };