Compare commits

..

261 Commits
0.47 ... v0.21

Author SHA1 Message Date
tqchen
76c44072d1 fix sometimes python cachelist problem 2014-05-20 15:42:19 -07:00
tqchen
ccde443590 more clean demo 2014-05-20 08:33:35 -07:00
tqchen
cf710bfa59 fix bug in classification, scale_pos_weight initialization 2014-05-20 08:30:19 -07:00
tqchen
be2c3d299e chg 2014-05-19 10:02:01 -07:00
Tianqi Chen
2eba59000a Merge pull request #7 from jrings/master
Compatibility with both Python 2(.7) and 3
2014-05-19 09:48:34 -07:00
Joerg Rings
a958fe8d52 Compatibility with both Python 2(.7) and 3 2014-05-19 11:23:53 -05:00
Tianqi Chen
96667b8bad Merge pull request #6 from tqchen/dev
Fix the bug in MAC
2014-05-17 11:07:42 -07:00
tqchen
95f4052aae add omp flag back 2014-05-17 11:07:12 -07:00
tqchen
e9e3e0281d use back g++ 2014-05-17 11:06:36 -07:00
tqchen
c23d8c8b88 force handle as void_p, seems fix mac problem 2014-05-17 11:03:21 -07:00
Tianqi Chen
e59f4d5a18 Merge pull request #5 from tqchen/dev
add return type for xgboost, don't know if it is mac problem. #4
2014-05-17 09:19:20 -07:00
tqchen
e267f4c5f9 add return type for xgboost, don't know if it is mac problem 2014-05-17 09:13:54 -07:00
Tianqi Chen
505e65ac08 Update README.md 2014-05-16 22:54:24 -07:00
Tianqi Chen
13fc48623e Merge pull request #2 from tqchen/dev
fix loss_type
2014-05-16 21:30:09 -07:00
tqchen
591a43ac0e some cleanup 2014-05-16 21:29:14 -07:00
tqchen
5375ac5c23 fix for loss_type problem in outside reset base 2014-05-16 21:28:03 -07:00
tqchen
6930758294 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-05-16 20:58:03 -07:00
tqchen
e09d6ab9de chg 2014-05-16 20:57:54 -07:00
antinucleon
db4a100f6b del 2014-05-17 03:57:38 +00:00
Tianqi Chen
495e37e0dc Merge pull request #1 from tqchen/dev
2.0 version, lots of changes
2014-05-16 20:53:19 -07:00
Tianqi Chen
b56b34944e Update README.md 2014-05-16 20:49:05 -07:00
tqchen
d4530b7a47 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:46:18 -07:00
tqchen
334cf5de9b add ignore 2014-05-16 20:46:08 -07:00
tqchen
004e8d811e final check 2014-05-16 20:44:02 -07:00
Tianqi Chen
4baefd857e Update README.md 2014-05-16 20:41:59 -07:00
Tianqi Chen
b52f01d61d Update README.md 2014-05-16 20:41:43 -07:00
Tianqi Chen
35f9ef684a Update README.md 2014-05-16 20:41:21 -07:00
Tianqi Chen
6f34096613 Update README.md 2014-05-16 20:41:05 -07:00
tqchen
31c5d7843f Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:37:55 -07:00
tqchen
f60dbe299e ok 2014-05-16 20:37:45 -07:00
yepyao
a77debc0c5 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-17 11:36:12 +08:00
yepyao
dc2b9c86e6 small change 2014-05-17 11:35:43 +08:00
yepyao
73bc8c0de4 small change 2014-05-17 11:34:24 +08:00
tqchen
ad8eb21fcd Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 20:29:17 -07:00
tqchen
416050d5c0 fix softmax 2014-05-16 20:28:07 -07:00
antinucleon
d5f6fba82d chg 2014-05-16 21:27:37 -06:00
tqchen
23f4c41035 chg 2014-05-16 20:18:34 -07:00
Tianqi Chen
7ea988a76b Update train.py 2014-05-16 20:16:10 -07:00
tqchen
d3c0ed14f3 multi class 2014-05-16 20:12:04 -07:00
antinucleon
2fcd875675 demo 2014-05-16 21:05:11 -06:00
antinucleon
615074efb6 Merge branch 'dev' of github.com:tqchen/xgboost into dev 2014-05-16 21:03:32 -06:00
Tianqi Chen
945b336fc6 Update README.md 2014-05-16 20:00:20 -07:00
antinucleon
8e8b8a8ee3 demo 2014-05-17 02:59:10 +00:00
antinucleon
42267807f5 demo 2014-05-16 20:57:42 -06:00
tqchen
df23464a20 do not need to dump in rank 2014-05-16 19:52:39 -07:00
tqchen
2ea8d9c511 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 19:51:41 -07:00
tqchen
3206235a5e before commit 2014-05-16 19:51:33 -07:00
yepyao
956fc09da0 small change 2014-05-17 10:50:15 +08:00
yepyao
da482500c7 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	demo/rank/mq2008.conf
	demo/rank/runexp.sh
	regrank/xgboost_regrank_obj.h
2014-05-17 10:40:12 +08:00
yepyao
b19f2bfda8 fix small bug 2014-05-17 10:35:10 +08:00
tqchen
21b21e69de add bing to author list 2014-05-16 19:33:59 -07:00
Tianqi Chen
b90d1dc92b Update demo.py 2014-05-16 19:30:32 -07:00
tqchen
3429ab3447 chgs 2014-05-16 19:24:53 -07:00
tqchen
ebcce4a2bf chg all settings to obj 2014-05-16 19:10:52 -07:00
tqchen
1839e6efe9 pre-release version 2014-05-16 18:49:02 -07:00
tqchen
9bc6e83afe chg scripts 2014-05-16 18:46:43 -07:00
tqchen
fd2774e133 cleanup 2014-05-16 18:40:46 -07:00
tqchen
72d3a6a3cc chg rank demo 2014-05-16 18:38:40 -07:00
tqchen
5febbecd88 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-16 18:29:37 -07:00
tqchen
b3c3ecd9c9 chng few things 2014-05-16 18:25:01 -07:00
tqchen
c28a1be34c minor changes 2014-05-16 18:19:57 -07:00
antinucleon
ae70b9b152 new speed test 2014-05-16 18:05:17 -06:00
antinucleon
e0a0343ae6 speedtest 2014-05-16 17:48:03 -06:00
yepyao
0e0d3efd6a use ndcg@all in lambdarank for ndcg 2014-05-16 23:06:24 +08:00
yepyao
a3bd5000ba small change 2014-05-16 21:20:41 +08:00
yepyao
dd71c0e070 Download data set from web site 2014-05-16 21:18:32 +08:00
kalenhaha
d9ea324057 Impement new Lambda rank interface 2014-05-16 20:42:46 +08:00
tqchen
0d29610c40 new lambda rank interface 2014-05-16 00:02:26 -07:00
Bing Xu
0af2c92d3b Update README.md 2014-05-16 01:30:29 -04:00
tqchen
f9cdce077b ok 2014-05-15 21:17:17 -07:00
tqchen
59183b9ed8 a correct version 2014-05-15 21:11:46 -07:00
tqchen
6ff272eec6 fix numpy convert 2014-05-15 20:28:34 -07:00
tqchen
c8073e13e4 ok 2014-05-15 20:05:22 -07:00
tqchen
698fa87bc3 ok 2014-05-15 18:56:28 -07:00
tqchen
8f56671901 bug fix in pairwise rank 2014-05-15 15:37:58 -07:00
tqchen
9ea9a7a01e cleanup code 2014-05-15 15:01:41 -07:00
tqchen
d59940f1d5 add xgcombine_buffer with weights 2014-05-15 14:41:11 -07:00
tqchen
6aa190e10c change data format to include weight in binary file, add get weight to python 2014-05-15 14:37:56 -07:00
tqchen
54c486bcf1 ok 2014-05-15 14:25:44 -07:00
tqchen
88ff293de5 add ams 2014-05-14 23:23:27 -07:00
tqchen
50af92e29e some fix 2014-05-14 16:55:59 -07:00
tqchen
bbe4957cd2 add AMS metric 2014-05-14 11:30:45 -07:00
kalenhaha
789ad18d36 add in grad and hess rescale in lambdarank 2014-05-14 23:13:27 +08:00
kalenhaha
2b34d5a25e small bug in ndcg eval 2014-05-13 14:30:42 +08:00
kalenhaha
bd574e4967 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-12 22:22:32 +08:00
kalenhaha
e8d81c1da5 Add LETOR MQ2008 for rank demo 2014-05-12 22:21:07 +08:00
kalenhaha
c84bbc91d1 remove sampler 2014-05-11 14:31:57 +08:00
kalenhaha
61e3d1562c small change 2014-05-11 14:25:30 +08:00
kalenhaha
97db8c29f2 small change 2014-05-11 14:03:21 +08:00
tqchen
f2552f8ef2 simple chgs 2014-05-09 20:39:15 -07:00
kalenhaha
2563b6d2d6 fix some warnings 2014-05-09 14:14:43 +08:00
kalenhaha
e90ffece67 Merge branch 'dev' of https://github.com/tqchen/xgboost into dev 2014-05-09 14:07:06 +08:00
kalenhaha
85f92681f9 Separating Lambda MAP and Lambda NDCG 2014-05-09 14:05:52 +08:00
tqchen
5e0d52cb8c add python o3 2014-05-08 20:15:23 -07:00
tqchen
c9d156d99e faster convert to numpy array 2014-05-08 19:35:06 -07:00
tqchen
ecf6e8f49f commit the fix 2014-05-08 19:31:32 -07:00
tqchen
93778aa4aa Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-07 12:00:17 -07:00
tqchen
f8cacc7308 fix omp for bug in obj 2014-05-07 11:52:12 -07:00
kalenhaha
c0e1e9fe7a Merge branch 'dev' of https://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_obj.hpp
2014-05-07 22:15:59 +08:00
tqchen
fa5afe2141 fix 2014-05-06 16:53:37 -07:00
tqchen
f7789ecf14 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-06 16:51:18 -07:00
tqchen
a57fbe091a Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev
Conflicts:
	regrank/xgboost_regrank_data.h
2014-05-06 16:51:11 -07:00
tqchen
9f82b53366 add regrank utils 2014-05-06 16:50:46 -07:00
tqchen
248b2cf74d right group size 2014-05-06 16:49:10 -07:00
tqchen
5fb9376af4 add cutomized training 2014-05-04 13:57:10 -07:00
tqchen
9c2bb12cd1 add cutomized training 2014-05-04 13:55:58 -07:00
tqchen
ebde99bde8 add boost group support to xgboost. now have beta multi-class classification 2014-05-04 12:10:03 -07:00
kalenhaha
ef7be5398d c++11 features removed 2014-05-04 16:58:44 +08:00
kalenhaha
2ef61bf982 c++11 features removed 2014-05-04 16:56:57 +08:00
tqchen
d4d141347a fix 2014-05-04 00:09:16 -07:00
tqchen
e18ba04751 add interact mode 2014-05-03 23:24:22 -07:00
tqchen
3388d1a8b5 add python interface for xgboost 2014-05-03 23:04:02 -07:00
tqchen
65917bb831 finish python lib 2014-05-03 22:18:25 -07:00
tqchen
140499ac9e finish matrix 2014-05-03 17:12:25 -07:00
tqchen
ccd037292d good 2014-05-03 16:15:44 -07:00
tqchen
59939d0b14 ok 2014-05-03 14:24:00 -07:00
tqchen
9a2c00554d important change to regrank interface, need some more test 2014-05-03 14:20:27 -07:00
tqchen
ee30c1728b try python 2014-05-03 10:54:08 -07:00
tqchen
8f75b0ef75 pass test 2014-05-02 18:04:45 -07:00
tqchen
3128e718e2 add new combine tool as promised 2014-05-02 12:55:34 -07:00
tqchen
657c617215 Merge branch 'dev' of ssh://github.com/tqchen/xgboost into dev 2014-05-01 11:01:05 -07:00
tqchen
439d4725a0 cleanup of evaluation metric, move c++11 codes into sample.h for backup, add lambda in a clean way latter 2014-05-01 11:00:50 -07:00
Tianqi Chen
8491bb3651 Update xgboost_omp.h 2014-05-01 10:16:05 -07:00
kalenhaha
cce96e8f41 fix some bugs in linux 2014-05-02 00:16:12 +08:00
kalenhaha
f02dd68713 lambda rank added 2014-05-01 22:17:26 +08:00
tqchen
ec14d32756 add softmax 2014-04-30 22:11:26 -07:00
tqchen
38577d45b0 add pre @ n 2014-04-30 22:00:53 -07:00
tqchen
ab0e7a3ddc use omp parallel sortting 2014-04-30 09:48:41 -07:00
tqchen
bbd952a021 add rank 2014-04-30 09:32:42 -07:00
tqchen
77e3051b1d add pairwise rank first version 2014-04-29 21:12:30 -07:00
tqchen
924e164c14 new AUC code 2014-04-29 17:26:58 -07:00
tqchen
25ff5ef169 new AUC evaluator, now compatible with weighted loss 2014-04-29 17:03:34 -07:00
tqchen
3ea29eccae make regression module compatible with rank loss, now support weighted loss 2014-04-29 16:16:02 -07:00
tqchen
0f8a3d21a5 chg fmap format 2014-04-29 09:59:10 -07:00
tqchen
7487c2f668 add auc evaluation metric 2014-04-24 22:20:40 -07:00
tqchen
88787b8573 remove unwanted private field 2014-04-21 10:42:19 -07:00
tqchen
17559a90f9 expose fmatrixs 2014-04-18 18:18:19 -07:00
tqchen
24696071a8 Merge branch 'master' of ssh://github.com/tqchen/xgboost
Conflicts:
	regression/xgboost_reg_data.h
2014-04-18 17:46:44 -07:00
tqchen
cca67af8d7 simplify data 2014-04-18 17:43:44 -07:00
kalenhaha
2beb92745f Lambda rank added 2014-04-11 10:50:13 +08:00
kalenhaha
d6b582dc70 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-04-11 10:48:45 +08:00
kalenhaha
218320daf2 Lambda rank added 2014-04-10 22:11:15 +08:00
kalenhaha
f83942d3e9 lambda rank added 2014-04-10 22:09:19 +08:00
Tianqi Chen
60d79eb2e7 Update xgboost_utils.h 2014-04-07 16:25:21 -07:00
kalenhaha
1136c71e64 rank pass toy 2014-04-07 23:25:35 +08:00
tqchen
1bbbb0cf7f add deleted main back 2014-04-06 09:32:27 -07:00
kalenhaha
1756fde0c6 small fix 2014-04-06 22:54:41 +08:00
kalenhaha
7f30fc1468 compiled 2014-04-06 22:51:52 +08:00
tqchen
d5607fbb55 add dev 2014-04-04 10:42:13 -07:00
kalenhaha
05d984d83d pairwise ranking implemented 2014-04-05 00:14:55 +08:00
kalenhaha
1110ae7421 Adding ranking task 2014-04-03 16:22:55 +08:00
tqchen
2aa1031d24 add dump nice to regression demo 2014-03-26 16:47:01 -07:00
tqchen
1440dc9c8f update regression 2014-03-26 16:25:44 -07:00
kalenhaha
27bd5496a8 small fix 2014-03-27 00:08:47 +08:00
kalenhaha
81b32525e0 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-26 23:50:56 +08:00
tqchen
6fa0948461 Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-03-25 17:18:27 -07:00
tqchen
61123f86aa small fix 2014-03-25 17:17:00 -07:00
Tianqi Chen
110b97fea2 Update README.md 2014-03-26 08:01:47 +08:00
Tianqi Chen
b2eb4e956b Update README.md 2014-03-26 08:01:24 +08:00
Tianqi Chen
56ae0e32e3 Update README 2014-03-26 07:21:15 +08:00
kalenhaha
e350c38483 change the regression demo data set 2014-03-24 23:23:11 +08:00
tqchen
e59ed018e6 fix test to pred 2014-03-24 00:31:53 -07:00
kalenhaha
3123d11655 remove test directory 2014-03-23 00:05:46 +08:00
kalenhaha
ca74cba9ec adding regression demo 2014-03-22 21:52:29 +08:00
kalenhaha
a84d4f3e68 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-03-22 21:50:31 +08:00
kalenhaha
76cd1561a0 separate binary classification and regression demo 2014-03-22 21:48:27 +08:00
Tianqi Chen
5b4f77488c Update README.md 2014-03-20 23:12:41 -07:00
Tianqi Chen
b0676fc682 Update README.md 2014-03-20 23:12:16 -07:00
tqchen
97418b113e add batch running 2014-03-20 16:27:24 -07:00
tqchen
d56394d2ef add feature constraint 2014-03-19 10:47:56 -07:00
tqchen
6a91438634 fixed remove bug 2014-03-13 13:42:40 -07:00
tqchen
da3b3c8136 neglok 2014-03-12 20:28:21 -07:00
tqchen
fcf06a7164 support int type 2014-03-12 17:58:14 -07:00
tqchen
8f9efa2725 more compact 2014-03-11 13:07:20 -07:00
tqchen
6e48a938c6 add accuracy 2014-03-11 13:06:22 -07:00
tqchen
19b28b978d fix delete 2014-03-11 12:40:51 -07:00
tqchen
8f16ef8e75 add remove tree 2014-03-11 11:25:50 -07:00
tqchen
d2377b26bd add name dumpath 2014-03-06 11:23:51 -08:00
tqchen
70f3f31206 add add and remove 2014-03-05 16:39:07 -08:00
tqchen
f62c5dc3c1 try interact mode 2014-03-05 15:28:53 -08:00
tqchen
2d67377a96 add a test folder 2014-03-05 15:20:11 -08:00
tqchen
d982be9dca complete row maker 2014-03-05 14:38:13 -08:00
tqchen
98114cabce add row tree maker, to be finished 2014-03-05 11:00:03 -08:00
tqchen
2910bdedf4 split new base treemaker, not very good abstraction, but ok 2014-03-05 10:20:36 -08:00
tqchen
128e94be1a fix reg model_out 2014-03-05 09:34:37 -08:00
tqchen
eade6ddf7c reupdate data 2014-03-04 22:47:39 -08:00
tqchen
9b45210fa7 fix text 2014-03-04 16:22:24 -08:00
tqchen
ddd61b43be fix fmatrix 2014-03-04 11:45:22 -08:00
tqchen
98e851d80f add simple text loader 2014-03-04 11:33:33 -08:00
tqchen
3d223232e3 ok fix 2014-03-03 22:20:45 -08:00
tqchen
b689b4525a big change, change interface to template, everything still OK 2014-03-03 22:16:37 -08:00
tqchen
a3ca03cfc1 backup makefile 2014-03-03 15:21:50 -08:00
tqchen
2aa1978cb6 compatibility issue with openmp 2014-03-03 15:11:41 -08:00
tqchen
e3b7abfb47 ok 2014-03-03 12:26:40 -08:00
tqchen
2adf905dcf maptree is not needed 2014-03-03 11:06:24 -08:00
tqchen
cfbeeef9c1 fix fmap 2014-03-03 11:05:10 -08:00
tqchen
8ae1d37828 auto do reboost 2014-03-02 16:42:22 -08:00
tqchen
0fc64d1c2a chg file name of reg 2014-03-02 16:39:00 -08:00
tqchen
1eca127f69 chg file name of reg 2014-03-02 16:38:59 -08:00
tqchen
c7b29774c2 change test task to pred 2014-03-02 16:20:42 -08:00
tqchen
a8f69878eb make style more like Google style 2014-03-02 13:30:24 -08:00
tqchen
51b6d86c17 add smart decision of nfeatures 2014-03-01 21:49:29 -08:00
tqchen
082a57ba0b fix type 2014-03-01 21:29:07 -08:00
tqchen
f3c98d0c4b add smart load 2014-03-01 21:15:54 -08:00
tqchen
1748e4517a full omp support for regression 2014-03-01 20:56:25 -08:00
tqchen
328e41244c fix col maker, make it default 2014-03-01 15:16:30 -08:00
tqchen
155b593984 add col maker 2014-03-01 14:00:09 -08:00
Tianqi Chen
76cbc754c9 Update README.md 2014-02-28 20:13:01 -08:00
Tianqi Chen
97ca3bf739 Update README.md 2014-02-28 20:10:57 -08:00
tqchen
752f336cb3 chg license, README 2014-02-28 20:09:40 -08:00
tqchen
fffad41e53 start add coltree maker 2014-02-28 11:44:50 -08:00
tqchen
10382f6365 add dump2json 2014-02-26 18:54:12 -08:00
tqchen
7b2fe1bf5d add pathdump 2014-02-26 17:08:23 -08:00
tqchen
88c982012a modify tree so that training is standalone 2014-02-26 16:03:00 -08:00
tqchen
b6f98bf37a modify tree so that training is standalone 2014-02-26 16:02:58 -08:00
tqchen
3a4d0f28d9 change input data structure 2014-02-26 11:51:58 -08:00
tqchen
e58daa6d52 fix mushroom 2014-02-24 23:19:58 -08:00
tqchen
a5b37e0395 finish mushroom 2014-02-24 23:06:57 -08:00
tqchen
e75488b578 add mushroom classification 2014-02-24 22:25:43 -08:00
tqchen
1160a38323 add mushroom 2014-02-24 22:19:40 -08:00
tqchen
4401d549f1 pass simple test 2014-02-20 22:28:05 -08:00
tqchen
fd120a8f5c changes to reg booster 2014-02-20 22:08:31 -08:00
kalenhaha
00add6dd1d tab eliminated 2014-02-19 13:25:01 +08:00
kalenhaha
cd009f2541 add toy data 2014-02-19 13:01:15 +08:00
kalenhaha
582be45810 add in reg.conf for configuration demo 2014-02-18 16:49:23 +08:00
kalenhaha
3c93216850 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-16 14:34:35 +08:00
kalenhaha
787f76e952 fix some bugs 2014-02-16 11:44:03 +08:00
tqchen
91c170e463 fix nboosters 2014-02-15 19:42:02 -08:00
tqchen
0c44347e82 update license 2014-02-15 17:45:48 -08:00
tqchen
603704287d Merge branch 'master' of ssh://github.com/tqchen/xgboost 2014-02-15 17:42:31 -08:00
tqchen
c933625f94 update license 2014-02-15 17:42:23 -08:00
tqchen
cebf39ea47 Update README.md 2014-02-15 11:22:50 -08:00
kalenhaha
f22139c659 Comments added 2014-02-13 13:04:55 +08:00
kalenhaha
06ce8c9f3a GBRT Train and Test Phase added 2014-02-12 23:30:32 +08:00
tqchen
98a60b3610 Update README.md 2014-02-11 20:38:06 -08:00
tqchen
2dc6c9c683 chg fmt to libsvm 2014-02-10 21:41:43 -08:00
tqchen
3e53fcf465 cleanup reg 2014-02-10 21:09:09 -08:00
tqchen
cb0fa75252 add regression data 2014-02-10 20:32:23 -08:00
kalenhaha
51a63d80d0 Merge branch 'master' of https://github.com/tqchen/xgboost 2014-02-11 11:19:27 +08:00
kalenhaha
1e356c5bd2 gbrt modified 2014-02-11 11:07:00 +08:00
kalenhaha
c5ada79be5 gbrt implemented 2014-02-10 23:40:38 +08:00
tqchen
dd924becd8 Update README.md 2014-02-08 19:02:33 -08:00
tqchen
7fa301a8ce Update README.md 2014-02-08 13:01:10 -08:00
tqchen
3d1e0badd3 Update README.md 2014-02-08 13:00:49 -08:00
tqchen
7e605306ad Update README.md 2014-02-08 12:50:24 -08:00
tqchen
5e5acdc121 finish readme 2014-02-08 11:47:37 -08:00
tqchen
7302a4e1b5 add linear booster 2014-02-08 11:24:35 -08:00
tqchen
21dd4b5904 add ok 2014-02-07 22:51:16 -08:00
tqchen
61e5410789 chg makefile 2014-02-07 22:43:13 -08:00
tqchen
0febb1a443 adapt tree booster 2014-02-07 22:41:32 -08:00
tqchen
36a04f17df adapt svdfeature tree 2014-02-07 22:38:26 -08:00
tqchen
3dd477c4b2 add detailed comment about gbmcore 2014-02-07 20:30:39 -08:00
tqchen
779d6a34de add empty folder for regression. TODO 2014-02-07 20:20:09 -08:00
tqchen
4535ab7e5c move core code to booster 2014-02-07 20:13:27 -08:00
tqchen
75c36a0667 add base code 2014-02-07 18:40:53 -08:00
tqchen
790c76e814 sync everything 2014-02-06 21:28:47 -08:00
tqchen
a81ea03022 add config 2014-02-06 21:26:27 -08:00
tqchen
a198759df6 update this folder 2014-02-06 16:06:59 -08:00
tqchen
a607444038 update this folder 2014-02-06 16:06:18 -08:00
tqchen
ee6a0c7f4a initial cleanup of interface 2014-02-06 16:03:04 -08:00
tqchen
57fef8bc54 init commit 2014-02-06 15:50:50 -08:00
414 changed files with 7229 additions and 42060 deletions

53
.gitignore vendored
View File

@@ -2,23 +2,21 @@
*.slo
*.lo
*.o
*.page
# Compiled Dynamic libraries
*.so
*.dylib
*.page
# Compiled Static libraries
*.lai
*.la
*.a
*~
*.Rcheck
*.rds
*.tar.gz
*txt*
*conf
*buffer
*model
xgboost
*pyc
*train
*test
@@ -26,48 +24,3 @@
*rar
*vali
*data
*sdf
Release
*exe*
*exp
ipch
*.filters
*.user
*log
Debug
*suo
*test*
.Rhistory
*.dll
*i386
*x64
*dump
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
*.Rproj
./xgboost
./xgboost.mpi
./xgboost.mock
rabit
#.Rbuildignore
R-package.Rproj
*.cache*
R-package/inst
R-package/src
#java
java/xgboost4j/target
java/xgboost4j/tmp
java/xgboost4j-demo/target
java/xgboost4j-demo/data/
java/xgboost4j-demo/tmp/
java/xgboost4j-demo/model/
nb-configuration*
dmlc-core
# Eclipse
.project
.cproject
.pydevproject
.settings/

View File

@@ -1,58 +0,0 @@
sudo: true
# Enabling test on Linux and OS X
os:
- linux
- osx
# Use Build Matrix to do lint and build seperately
env:
matrix:
- TASK=lint LINT_LANG=cpp
- TASK=lint LINT_LANG=python
- TASK=R-package CXX=g++
- TASK=python-package CXX=g++
- TASK=python-package3 CXX=g++
- TASK=java-package CXX=g++
- TASK=build CXX=g++
- TASK=build-with-dmlc CXX=g++
os:
- linux
- osx
# dependent apt packages
addons:
apt:
packages:
- doxygen
- libopenmpi-dev
- wget
- libcurl4-openssl-dev
- unzip
- python-numpy
- python-scipy
before_install:
- scripts/travis_osx_install.sh
- git clone https://github.com/dmlc/dmlc-core
- export TRAVIS=dmlc-core/scripts/travis/
- export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
- source ${TRAVIS}/travis_setup_env.sh
install:
- pip install cpplint pylint --user `whoami`
script: scripts/travis_script.sh
after_failure:
- scripts/travis_after_failure.sh
notifications:
email:
on_success: change
on_failure: always

View File

@@ -1,61 +0,0 @@
Change Log
==========
xgboost-0.1
-----------
* Initial release
xgboost-0.2x
------------
* Python module
* Weighted samples instances
* Initial version of pairwise rank
xgboost-0.3
-----------
* Faster tree construction module
- Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
* Support for boosting from initial predictions
* Experimental version of LambdaRank
* Linear booster is now parallelized, using parallel coordinated descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
xgboost-0.4
-----------
* Distributed version of xgboost that runs on YARN, scales to billions of examples
* Direct save/load data and model from/to S3 and HDFS
* Feature importance visualization in R module, by Michael Benesty
* Predict leaf index
* Poisson regression for counts data
* Early stopping option in training
* Native save load support in R and python
- xgboost models now can be saved using save/load in R
- xgboost python model is now pickable
* sklearn wrapper is supported in python module
* Experimental External memory version
xgboost-0.47
------------
* Changes in R library
- fixed possible problem of poisson regression.
- switched from 0 to NA for missing values.
- exposed access to additional model parameters.
* Changes in Python library
- throws exception instead of crash terminal when a parameter error happens.
- has importance plot and tree plot functions.
- accepts different learning rates for each boosting round.
- allows model training continuation from previously saved model.
- allows early stopping in CV.
- allows feval to return a list of tuples.
- allows eval_metric to handle additional format.
- improved compatibility in sklearn module.
- additional parameters added for sklearn wrapper.
- added pip installation functionality.
- supports more Pandas DataFrame dtypes.
- added best_ntree_limit attribute, in addition to best_score and best_iteration.
* Java api is ready for use
* Added more test cases and continuous integration to make each build more robust.
on going at master
------------------

View File

@@ -1,61 +0,0 @@
Contributors of DMLC/XGBoost
============================
XGBoost has been developed and used by a group of active community. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users.
Comitters
---------
Committers are people who have made substantial contribution to the project and granted write access to the project.
* [Tianqi Chen](https://github.com/tqchen), University of Washington
- Tianqi is a PhD working on large-scale machine learning, he is the creator of the project.
* [Tong He](https://github.com/hetong007), Simon Fraser University
- Tong is a master student working on data mining, he is the maintainer of xgboost R package.
* [Bing Xu](https://github.com/antinucleon)
- Bing is the original creator of xgboost python package and currently the maintainer of [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl).
* [Michael Benesty](https://github.com/pommedeterresautee)
- Micheal is a lawyer, data scientist in France, he is the creator of xgboost interactive analysis module in R.
* [Yuan Tang](https://github.com/terrytangyuan)
- Yuan is a data scientist in Chicago, US. He contributed mostly in R and Python packages.
Become a Comitter
-----------------
XGBoost is a opensource project and we are actively looking for new comitters who are willing to help maintaining and lead the project.
Committers comes from contributors who:
* Made substantial contribution to the project.
* Willing to spent time on maintaining and lead the project.
New committers will be proposed by current comitter memembers, with support from more than two of current comitters.
List of Contributors
--------------------
* [Full List of Contributors](https://github.com/dmlc/xgboost/graphs/contributors)
- To contributors: please add your name to the list when you submit a patch to the project:)
* [Kailong Chen](https://github.com/kalenhaha)
- Kailong is an early contributor of xgboost, he is creator of ranking objectives in xgboost.
* [Skipper Seabold](https://github.com/jseabold)
- Skipper is the major contributor to the scikit-learn module of xgboost.
* [Zygmunt Zając](https://github.com/zygmuntz)
- Zygmunt is the master behind the early stopping feature frequently used by kagglers.
* [Ajinkya Kale](https://github.com/ajkl)
* [Boliang Chen](https://github.com/cblsjtu)
* [Vadim Khotilovich](https://github.com/khotilov)
* [Yangqing Men](https://github.com/yanqingmen)
- Yangqing is the creator of xgboost java package.
* [Engpeng Yao](https://github.com/yepyao)
* [Giulio](https://github.com/giuliohome)
- Giulio is the creator of windows project of xgboost
* [Jamie Hall](https://github.com/nerdcha)
- Jamie is the initial creator of xgboost sklearn modue.
* [Yen-Ying Lee](https://github.com/white1033)
* [Masaaki Horikoshi](https://github.com/sinhrks)
- Masaaki is the initial creator of xgboost python plotting module.
* [Hongliang Liu](https://github.com/phunterlau)
- Hongliang is the maintainer of xgboost python PyPI package for pip installation.
* [daiyl0320](https://github.com/daiyl0320)
- daiyl0320 contributed patch to xgboost distributed version more robust, and scales stably on TB scale datasets.
* [Huayi Zhang](https://github.com/irachex)
* [Johan Manders](https://github.com/johanmanders)
* [yoori](https://github.com/yoori)
* [Mathias Müller](https://github.com/far0n)
* [Sam Thomson](https://github.com/sammthomson)
* [ganesh-krishnan](https://github.com/ganesh-krishnan)
* [Damien Carol](https://github.com/damiencarol)

View File

@@ -1,4 +1,4 @@
Copyright (c) 2014 by Contributors
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

214
Makefile
View File

@@ -1,214 +1,26 @@
export CC = $(if $(shell which gcc-5 2>/dev/null),gcc-5,gcc)
export CXX = $(if $(shell which g++-5 2>/dev/null),g++-5,g++)
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
# java include path
export JAVAINCFLAGS = -I${JAVA_HOME}/include -I./java
ifeq ($(OS), Windows_NT)
export CXX = g++ -m64
export CC = gcc -m64
endif
UNAME= $(shell uname)
ifeq ($(UNAME), Linux)
LDFLAGS += -lrt
JAVAINCFLAGS += -I${JAVA_HOME}/include/linux
endif
ifeq ($(UNAME), Darwin)
JAVAINCFLAGS += -I${JAVA_HOME}/include/darwin
endif
ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
#CFLAGS += -fopenmp
ifeq ($(omp_mac_static),1)
#CFLAGS += -fopenmp -Bstatic
CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
else
CFLAGS += -fopenmp
endif
endif
# by default use c++11
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
endif
# handling dmlc
ifdef dmlc
ifndef config
ifneq ("$(wildcard $(dmlc)/config.mk)","")
config = $(dmlc)/config.mk
else
config = $(dmlc)/make/config.mk
endif
endif
include $(config)
include $(dmlc)/make/dmlc.mk
LDFLAGS+= $(DMLC_LDFLAGS)
LIBDMLC=$(dmlc)/libdmlc.a
else
LIBDMLC=dmlc_simple.o
endif
ifndef WITH_FPIC
WITH_FPIC = 1
endif
ifeq ($(WITH_FPIC), 1)
CFLAGS += -fPIC
endif
ifeq ($(OS), Windows_NT)
LIBRABIT = subtree/rabit/lib/librabit_empty.a
SLIB = wrapper/xgboost_wrapper.dll
else
LIBRABIT = subtree/rabit/lib/librabit.a
SLIB = wrapper/libxgboostwrapper.so
endif
# java lib
JLIB = java/libxgboost4j.so
export CC = gcc
export CXX = g++
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
# specify tensor path
BIN = xgboost
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o dmlc_simple.o
MPIBIN =
ifeq ($(WITH_FPIC), 1)
TARGET = $(BIN) $(OBJ) $(SLIB)
else
TARGET = $(BIN)
endif
OBJ =
.PHONY: clean all
ifndef LINT_LANG
LINT_LANG= "all"
endif
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
.PHONY: clean all mpi python Rpack lint
xgboost: regrank/xgboost_regrank_main.cpp regrank/*.h regrank/*.hpp booster/*.h booster/*/*.hpp booster/*.hpp
all: $(TARGET)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
dmlc_simple.o: src/io/dmlc_simple.cpp src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost: updater.o gbm.o io.o main.o $(LIBRABIT) $(LIBDMLC)
wrapper/xgboost_wrapper.dll wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
java: java/libxgboost4j.so
java/libxgboost4j.so: java/xgboost4j_wrapper.cpp wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o $(LIBRABIT) $(LIBDMLC)
# dependency on rabit
subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
+ cd subtree/rabit;make lib/librabit.a; cd ../..
subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
+ cd subtree/rabit;make lib/librabit_empty.a; cd ../..
subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
+ cd subtree/rabit;make lib/librabit_mock.a; cd ../..
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
$(CXX) $(CFLAGS) -fPIC -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(SLIB) :
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(DLLFLAGS)
$(JLIB) :
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) $(JAVAINCFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
install:
cp -f -r $(BIN) $(INSTALL_PATH)
Rpack:
make clean
cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf xgboost/src/*/*.o
rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
mkdir xgboost/src/subtree
mkdir xgboost/src/subtree/rabit
cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
rm -rf xgboost/src/subtree/rabit/src/*.o
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
cp xgboost/src/Makevars xgboost/src/Makevars.win
# R CMD build --no-build-vignettes xgboost
# R CMD build xgboost
# rm -rf xgboost
# R CMD check --as-cran xgboost*.tar.gz
Rbuild:
make Rpack
R CMD build xgboost
rm -rf xgboost
Rcheck:
make Rbuild
R CMD check --as-cran xgboost*.tar.gz
pythonpack:
#for pip maintainer only
cd subtree/rabit;make clean;cd ..
rm -rf xgboost-deploy xgboost*.tar.gz
cp -r python-package xgboost-deploy
#cp *.md xgboost-deploy/
cp LICENSE xgboost-deploy/
cp Makefile xgboost-deploy/xgboost
cp -r wrapper xgboost-deploy/xgboost
cp -r subtree xgboost-deploy/xgboost
cp -r multi-node xgboost-deploy/xgboost
cp -r windows xgboost-deploy/xgboost
cp -r src xgboost-deploy/xgboost
cp python-package/setup_pip.py xgboost-deploy/setup.py
#make python
pythonbuild:
make pythonpack
python setup.py install
pythoncheck:
make pythonbuild
python -c 'import xgboost;print xgboost.core.find_lib_path()'
# lint requires dmlc to be in current folder
lint:
dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
clean:
$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
cd subtree/rabit; make clean; cd ..
$(RM) $(OBJ) $(BIN) *~

View File

@@ -1,6 +0,0 @@
\.o$
\.so$
\.dll$
^.*\.Rproj$
^\.Rproj\.user$
README.md

View File

@@ -1,37 +0,0 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 0.4-2
Date: 2015-08-01
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
Michael Benesty <michael@benesty.fr>
Maintainer: Tong He <hetong007@gmail.com>
Description: Extreme Gradient Boosting, which is an efficient implementation
of gradient boosting framework. This package is its R interface. The package
includes efficient linear model solver and tree learning algorithms. The package
can automatically do parallel computation on a single machine which could be
more than 10 times faster than existing gradient boosting packages. It supports
various objective functions, including regression, classification and ranking.
The package is made to be extensible, so that users are also allowed to define
their own objectives easily.
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/dmlc/xgboost
BugReports: https://github.com/dmlc/xgboost/issues
VignetteBuilder: knitr
Suggests:
knitr,
ggplot2 (>= 1.0.1),
DiagrammeR (>= 0.8.1),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3),
testthat,
igraph (>= 1.0.1)
Depends:
R (>= 2.10)
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringr (>= 0.6.2)
RoxygenNote: 5.0.1

View File

@@ -1,13 +0,0 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -1,48 +0,0 @@
# Generated by roxygen2: do not edit by hand
export(getinfo)
export(setinfo)
export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.multi.trees)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.train)
export(xgboost)
exportMethods(nrow)
exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Matrix,cBind)
importFrom(Matrix,colSums)
importFrom(Matrix,sparse.model.matrix)
importFrom(Matrix,sparseVector)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,copy)
importFrom(data.table,data.table)
importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)
importFrom(stringr,str_detect)
importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all)
importFrom(stringr,str_match)
importFrom(stringr,str_replace)
importFrom(stringr,str_split)
importFrom(stringr,str_trim)

View File

@@ -1,55 +0,0 @@
setClass('xgb.DMatrix')
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#'
#' The information can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, ...){
UseMethod("getinfo")
}
#' @param object Object of class \code{xgb.DMatrix}
#' @param name the name of the field to get
#' @param ... other parameters
#' @rdname getinfo
#' @method getinfo xgb.DMatrix
setMethod("getinfo", signature = "xgb.DMatrix",
definition = function(object, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name != "label" && name != "weight" &&
name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
if (name != "nrow"){
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
} else {
ret <- xgb.numrow(object)
}
return(ret)
})

View File

@@ -1,19 +0,0 @@
setGeneric("nrow")
#' @title Number of xgb.DMatrix rows
#' @description \code{nrow} return the number of rows present in the \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' stopifnot(nrow(dtrain) == nrow(train$data))
#'
#' @export
setMethod("nrow",
signature = "xgb.DMatrix",
definition = function(x) {
xgb.numrow(x)
}
)

View File

@@ -1,80 +0,0 @@
setClass("xgb.Booster.handle")
setClass("xgb.Booster",
slots = c(handle = "xgb.Booster.handle",
raw = "raw"))
#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on xgboost model object.
#'
#' @param object Object of class "xgb.Boost"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#'
#' @details
#' The option \code{ntreelimit} purpose is to let the user train a model with lots
#' of trees but use only the first trees for prediction to avoid overfitting
#' (without having to train a new model with less trees).
#'
#' The option \code{predleaf} purpose is inspired from §3.1 of the paper
#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
#' The idea is to use the model as a generator of new features which capture non linear link
#' from original features.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#' @export
setMethod("predict", signature = "xgb.Booster",
definition = function(object, newdata, missing = NA,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(object) != "xgb.Booster"){
stop("predict: model in prediction must be of class xgb.Booster")
} else {
object <- xgb.Booster.check(object, saveraw = FALSE)
}
if (class(newdata) != "xgb.DMatrix") {
newdata <- xgb.DMatrix(newdata, missing = missing)
}
if (is.null(ntreelimit)) {
ntreelimit <- 0
} else {
if (ntreelimit < 1){
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
option <- 0
if (outputmargin) {
option <- option + 1
}
if (predleaf) {
option <- option + 2
}
ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option),
as.integer(ntreelimit), PACKAGE = "xgboost")
if (predleaf){
len <- getinfo(newdata, "nrow")
if (length(ret) == len){
ret <- matrix(ret,ncol = 1)
} else {
ret <- matrix(ret, ncol = len)
ret <- t(ret)
}
}
return(ret)
})

View File

@@ -1,18 +0,0 @@
#' Predict method for eXtreme Gradient Boosting model handle
#'
#' Predicted values based on xgb.Booster.handle object.
#'
#' @param object Object of class "xgb.Boost.handle"
#' @param ... Parameters pass to \code{predict.xgb.Booster}
#'
setMethod("predict", signature = "xgb.Booster.handle",
definition = function(object, ...) {
if (class(object) != "xgb.Booster.handle"){
stop("predict: model in prediction must be of class xgb.Booster.handle")
}
bst <- xgb.handleToBooster(object)
ret <- predict(bst, ...)
return(ret)
})

View File

@@ -1,37 +0,0 @@
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' It can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{group}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, ...){
UseMethod("setinfo")
}
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param info the specific field of information to set
#' @param ... other parameters
#' @rdname setinfo
#' @method setinfo xgb.DMatrix
setMethod("setinfo", signature = "xgb.DMatrix",
definition = function(object, name, info) {
xgb.setinfo(object, name, info)
})

View File

@@ -1,44 +0,0 @@
setClass('xgb.DMatrix')
#' Get a new DMatrix containing the specified rows of
#' orginal xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' orginal xgb.DMatrix object
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' dsub <- slice(dtrain, 1:3)
#' @rdname slice
#' @export
slice <- function(object, ...){
UseMethod("slice")
}
#' @param object Object of class "xgb.DMatrix"
#' @param idxset a integer vector of indices of rows needed
#' @param ... other parameters
#' @rdname slice
#' @method slice xgb.DMatrix
setMethod("slice", signature = "xgb.DMatrix",
definition = function(object, idxset, ...) {
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")
attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len == nr)
if (length(ind) > 0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
})

View File

@@ -1,346 +0,0 @@
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @import methods
# depends on matrix
.onLoad <- function(libname, pkgname) {
library.dynam("xgboost", pkgname, libname)
}
.onUnload <- function(libpath) {
library.dynam.unload("xgboost", libpath)
}
# set information into dmatrix, this mutate dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name == "label") {
if (length(info) != xgb.numrow(dmat))
stop("The length of labels must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "weight") {
if (length(info) != xgb.numrow(dmat))
stop("The length of weights must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "base_margin") {
# if (length(info)!=xgb.numrow(dmat))
# stop("The length of base margin must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "group") {
if (sum(info) != xgb.numrow(dmat))
stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
PACKAGE = "xgboost")
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
PACKAGE = "xgboost")
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else {
stop("xgb.Booster: modelfile must be character or raw vector")
}
}
return(structure(handle, class = "xgb.Booster.handle"))
}
# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle, raw = NULL)
{
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
isnull <- is.null(bst$handle)
if (!isnull) {
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
}
if (isnull) {
bst$handle <- xgb.Booster(modelfile = bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)
}
return(bst)
}
## ----the following are low level iteratively function, not needed if
## you do not want to use them ---------------------------------------
# get dmatrix from data, label
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
if (!is.null(weight)){
xgb.setinfo(dtrain, "weight", weight)
}
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (inClass == "character") {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
dtrain <- data
} else if (inClass == "data.frame") {
stop("xgboost only support numerical matrix input,
use 'data.matrix' to transform the data.")
} else {
stop("xgboost: Invalid input of data")
}
}
return (dtrain)
}
xgb.numrow <- function(dmat) {
nrow <- .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost")
return(nrow)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
return(TRUE)
}
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
if (is.null(obj)) {
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
PACKAGE = "xgboost")
} else {
pred <- predict(booster, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(booster, dtrain, gpair)
}
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
evnames <- list()
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
evnames, PACKAGE = "xgboost")
} else {
msg <- paste("[", iter, "]", sep="")
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
preds <- predict(booster, w[[1]])
ret <- feval(preds, w[[1]])
msg <- paste(msg, "\t", names(w), "-", ret$metric, ":", ret$value, sep="")
}
}
} else {
msg <- ""
}
if (prediction){
preds <- predict(booster,watchlist[[2]])
return(list(msg,preds))
}
return(msg)
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
if(is.null(folds)) {
if (exists('objective', where=param) && is.character(param$objective) &&
strtrim(param[['objective']], 5) == 'rank:') {
stop("\tAutomatic creation of CV-folds is not implemented for ranking!\n",
"\tConsider providing pre-computed CV-folds through the folds parameter.")
}
y <- getinfo(dall, 'label')
randidx <- sample(1 : xgb.numrow(dall))
if (stratified & length(y) == length(randidx)) {
y <- y[randidx]
#
# WARNING: some heuristic logic is employed to identify classification setting!
#
# For classification, need to convert y labels to factor before making the folds,
# and then do stratification by factor levels.
# For regression, leave y numeric and do stratification by quantiles.
if (exists('objective', where=param) && is.character(param$objective)) {
# If 'objective' provided in params, assume that y is a classification label
# unless objective is reg:linear
if (param[['objective']] != 'reg:linear') y <- factor(y)
} else {
# If no 'objective' given in params, it means that user either wants to use
# the default 'reg:linear' objective or has provided a custom obj function.
# Here, assume classification setting when y has 5 or less unique values:
if (length(unique(y)) <= 5) y <- factor(y)
}
folds <- xgb.createFolds(y, nfold)
} else {
# make simple non-stratified folds
kstep <- length(randidx) %/% nfold
folds <- list()
for (i in 1:(nfold - 1)) {
folds[[i]] <- randidx[1:kstep]
randidx <- setdiff(randidx, folds[[i]])
}
folds[[nfold]] <- randidx
}
}
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, folds[[k]])
didx <- c()
for (i in 1:nfold) {
if (i != k) {
didx <- append(didx, folds[[i]])
}
}
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist <- list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]])
}
return (ret)
}
xgb.cv.aggcv <- function(res, showsd = TRUE) {
header <- res[[1]]
ret <- header[1]
for (i in 2:length(header)) {
kv <- strsplit(header[i], ":")[[1]]
ret <- paste(ret, "\t", kv[1], ":", sep="")
stats <- c()
stats[1] <- as.numeric(kv[2])
for (j in 2:length(res)) {
tkv <- strsplit(res[[j]][i], ":")[[1]]
stats[j] <- as.numeric(tkv[2])
}
ret <- paste(ret, sprintf("%f", mean(stats)), sep="")
if (showsd) {
ret <- paste(ret, sprintf("+%f", stats::sd(stats)), sep="")
}
}
return (ret)
}
# Shamelessly copied from caret::createFolds
# and simplified by always returning an unnamed list of test indices
xgb.createFolds <- function(y, k = 10)
{
if(is.numeric(y)) {
## Group the numeric data based on their magnitudes
## and sample within those groups.
## When the number of samples is low, we may have
## issues further slicing the numeric data into
## groups. The number of groups will depend on the
## ratio of the number of folds to the sample size.
## At most, we will use quantiles. If the sample
## is too small, we just do regular unstratified
## CV
cuts <- floor(length(y) / k)
if (cuts < 2) cuts <- 2
if (cuts > 5) cuts <- 5
y <- cut(y,
unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
include.lowest = TRUE)
}
if(k < length(y)) {
## reset levels so that the possible levels and
## the levels in the vector are the same
y <- factor(as.character(y))
numInClass <- table(y)
foldVector <- vector(mode = "integer", length(y))
## For each class, balance the fold allocation as far
## as possible, then resample the remainder.
## The final assignment of folds is also randomized.
for(i in 1:length(numInClass)) {
## create a vector of integers from 1:k as many times as possible without
## going over the number of samples in the class. Note that if the number
## of samples in a class is less than k, nothing is producd here.
seqVector <- rep(1:k, numInClass[i] %/% k)
## add enough random integers to get length(seqVector) == numInClass[i]
if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
## shuffle the integers for fold assignment and assign to this classes's data
foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
}
} else foldVector <- seq(along = y)
out <- split(seq(along = y), foldVector)
names(out) <- NULL
out
}

View File

@@ -1,44 +0,0 @@
#' Contruct xgb.DMatrix object
#'
#' Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file.
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#
#' @param ... other information to pass to \code{info}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
PACKAGE = "xgboost")
} else if (is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
PACKAGE = "xgboost")
} else if (class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
PACKAGE = "xgboost")
} else {
stop(paste("xgb.DMatrix: does not support to construct from ",
typeof(data)))
}
dmat <- structure(handle, class = "xgb.DMatrix")
info <- append(info, list(...))
if (length(info) == 0)
return(dmat)
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
return(dmat)
}

View File

@@ -1,26 +0,0 @@
#' Save xgb.DMatrix object to binary file
#'
#' Save xgb.DMatrix object to binary file
#'
#' @param DMatrix the DMatrix object
#' @param fname the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
xgb.DMatrix.save <- function(DMatrix, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character")
}
if (class(DMatrix) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE),
PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.DMatrix.save: the input must be xgb.DMatrix")
return(FALSE)
}

View File

@@ -1,91 +0,0 @@
#' Create new features from a previously learned model
#'
#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
#'
#' @importFrom magrittr %>%
#' @importFrom Matrix cBind
#' @importFrom Matrix sparse.model.matrix
#'
#' @param model decision tree boosting model learned on the original data
#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
#'
#' @return \code{dgCMatrix} matrix including both the original data and the new features.
#'
#' @details
#' This is the function inspired from the paragraph 3.1 of the paper:
#'
#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
#'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quiñonero Candela)}
#'
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
#'
#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
#'
#' Extract explaining the method:
#'
#' "\emph{We found that boosted decision trees are a powerful and very
#' convenient way to implement non-linear and tuple transformations
#' of the kind we just described. We treat each individual
#' tree as a categorical feature that takes as value the
#' index of the leaf an instance ends up falling in. We use
#' 1-of-K coding of this type of features.
#'
#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
#' where the first subtree has 3 leafs and the second 2 leafs. If an
#' instance ends up in leaf 2 in the first subtree and leaf 1 in
#' second subtree, the overall input to the linear classifier will
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
#' correspond to the leaves of the first subtree and last 2 to
#' those of the second subtree.
#'
#' [...]
#'
#' We can understand boosted decision tree
#' based transformation as a supervised feature encoding that
#' converts a real-valued vector into a compact binary-valued
#' vector. A traversal from root node to a leaf node represents
#' a rule on certain features.}"
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
#'
#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
#' nround = 4
#'
#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy without new features
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Convert previous features to one hot encoding
#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
#'
#' # learning with new features
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
#' watchlist <- list(train = new.dtrain)
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy with new features
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Here the accuracy was already good and is now perfect.
#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
#'
#' @export
xgb.create.features <- function(model, training.data){
pred_with_leaf = predict(model, training.data, predleaf = TRUE)
cols <- list()
for(i in 1:length(trees)){
# max is not the real max but it s not important for the purpose of adding features
leaf.id <- sort(unique(pred_with_leaf[,i]))
cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
}
cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
}

View File

@@ -1,250 +0,0 @@
#' Cross Validation
#'
#' The cross valudation function of xgboost
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom stringr str_match
#'
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' See \link{xgb.train} for further details.
#' See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
#' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label option field, when data is \code{Matrix}
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in corss validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
#' Possible options are:
#' \itemize{
#' \item \code{error} binary classification error rate
#' \item \code{rmse} Rooted mean square error
#' \item \code{logloss} negative log-likelihood function
#' \item \code{auc} Area under curve
#' \item \code{merror} Exact matching error, used to evaluate multi-class classification
#' }
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain.
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain.
#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}
#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
#' If folds are supplied, the nfold and stratified parameters would be ignored.
#' @param verbose \code{boolean}, print the statistics during the process
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#'
#' @param ... other parameters to pass to \code{params}.
#'
#' @return
#' If \code{prediction = TRUE}, a list with the following elements is returned:
#' \itemize{
#' \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
#' \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
#' }
#'
#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
#'
#' @details
#' The original sample is randomly partitioned into \code{nfold} equal size subsamples.
#'
#' Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
#'
#' The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
#'
#' All observations are used for both training and validation.
#'
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
#' max.depth =3, eta = 1, objective = "binary:logistic")
#' print(history)
#' @export
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA,
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L,
early.stop.round = NULL, maximize = NULL, ...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
if(!is.null(folds)) {
if(class(folds) != "list" | length(folds) < 2) {
stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold")
}
nfold <- length(folds)
}
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
dtrain <- xgb.get.DMatrix(data, label, missing)
dot.params <- list(...)
nms.params <- names(params)
nms.dot.params <- names(dot.params)
if (length(intersect(nms.params,nms.dot.params)) > 0)
stop("Duplicated defined term in parameters. Please check your list of params.")
params <- append(params, dot.params)
params <- append(params, list(silent=1))
for (mc in metrics) {
params <- append(params, list("eval_metric"=mc))
}
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.cv: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective) == 'function') {
obj <- params$objective
params[['objective']] <- NULL
}
# if (!is.null(params$eval_metric) && !is.null(feval))
# stop("xgb.cv: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric) == 'function') {
feval <- params$eval_metric
params[['eval_metric']] <- NULL
}
# Early Stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize <- FALSE
} else {
maximize <- TRUE
}
}
if (maximize) {
bestScore <- 0
} else {
bestScore <- Inf
}
bestInd <- 0
earlyStopflag <- FALSE
if (length(metrics) > 1)
warning('Only the first metric is used for early stopping process.')
}
xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds)
obj_type <- params[['objective']]
mat_pred <- FALSE
if (!is.null(obj_type) && obj_type == 'multi:softprob')
{
num_class <- params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
mat_pred <- TRUE
}
else
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
print.every.n <- max(as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- xgb_folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
if(verbose)
if (0 == (i - 1L) %% print.every.n)
cat(ret, "\n", sep="")
# early_Stopping
if (!is.null(early.stop.round)){
score <- strsplit(ret,'\\s+')[[1]][2 + length(metrics)]
score <- strsplit(score,'\\+|:')[[1]][[2]]
score <- as.numeric(score)
if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
bestScore <- score
bestInd <- i
} else {
if (i - bestInd >= early.stop.round) {
earlyStopflag <- TRUE
cat('Stopping. Best iteration:', bestInd, '\n')
break
}
}
}
}
if (prediction) {
for (k in 1:nfold) {
fd <- xgb_folds[[k]]
if (!is.null(early.stop.round) && earlyStopflag) {
res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction)
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction)
}
if (mat_pred) {
pred_mat <- matrix(res[[2]],num_class,length(fd$index))
predictValues[fd$index,] <- t(pred_mat)
} else {
predictValues[fd$index] <- res[[2]]
}
}
}
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
colnamesMean <- paste(colnames, "mean")
if(showsd) colnamesStd <- paste(colnames, "std")
colnames <- c()
if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
else colnames <- colnamesMean
type <- rep(x = "numeric", times = length(colnames))
dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist( list( dt, .), use.names = F, fill = F)}
if (prediction) {
return( list( dt = dt,pred = predictValues))
}
return(dt)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

View File

@@ -1,70 +0,0 @@
#' Save xgboost model to text file
#'
#' Save a xgboost model to text file. Could be parsed later.
#'
#' @importFrom magrittr %>%
#' @importFrom stringr str_replace
#' @importFrom data.table fread
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @param model the model object.
#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
#' @param fmap feature map file representing the type of feature.
#' Detailed description could be found at
#' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
#' See demo/ for walkthrough example in R, and
#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
#' for example Format.
#' @param with.stats whether dump statistics of splits
#' When this option is on, the model dump comes with two additional statistics:
#' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node.
#'
#' @return
#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
#' @export
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
stop("model: argument must be type xgb.Booster")
} else {
model <- xgb.Booster.check(model)
}
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fname: argument must be type character (when provided)")
}
if (!(class(fmap) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fmap: argument must be type character (when provided)")
}
longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost")
dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F)
setnames(dt, "Lines")
if(is.null(fname)) {
result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)]
return(result)
} else {
result <- dt[Lines != "0"][Lines != ""][, paste(Lines)] %>% writeLines(fname)
return(TRUE)
}
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Lines", "."))

View File

@@ -1,115 +0,0 @@
#' Show importance of features in a model
#'
#' Create a \code{data.table} of the most important features of a model.
#'
#' @importFrom data.table data.table
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom Matrix colSums
#' @importFrom Matrix cBind
#' @importFrom Matrix sparseVector
#'
#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param model generated by the \code{xgb.train} function.
#' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
#' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
#' @details
#' This function is for both linear and tree models.
#'
#' \code{data.table} is returned by the function.
#' The columns are :
#' \itemize{
#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
#' }
#'
#' If you don't provide \code{feature_names}, index of the features will be used instead.
#'
#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
#'
#' Co-occurence count
#' ------------------
#'
#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
#'
#' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
#'
#' If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' # Same thing with co-occurence computation this time
#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)
#'
#' @export
xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) {
stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
}
if(class(label) == "numeric"){
if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
}
treeDump <- function(feature_names, text, keepDetail){
if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
}
linearDump <- function(feature_names, text){
weights <- which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric
if(is.null(feature_names)) feature_names <- seq(to = length(weights))
data.table(Feature = feature_names, Weight = weights)
}
model.text.dump <- xgb.dump(model = model, with.stats = T)
if(model.text.dump[2] == "bias:"){
result <- model.text.dump %>% linearDump(feature_names, .)
if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
} else {
result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))
# Co-occurence computation
if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
# Take care of missing column
a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
# Bind the two Matrix and reorder columns
c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
rm(a)
# Apply split
d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
apply(c & d, 2, . %>% target %>% sum) -> vec
result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL]
}
}
result
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c(".", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover"))

View File

@@ -1,31 +0,0 @@
#' Load xgboost model from binary file
#'
#' Load xgboost model from the binary model file
#'
#' @param modelfile the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop("xgb.load: modelfile cannot be NULL")
handle <- xgb.Booster(modelfile = modelfile)
# re-use modelfile if it is raw so we donot need to serialize
if (typeof(modelfile) == "raw") {
bst <- xgb.handleToBooster(handle, modelfile)
} else {
bst <- xgb.handleToBooster(handle, NULL)
}
bst <- xgb.Booster.check(bst)
return(bst)
}

View File

@@ -1,153 +0,0 @@
#' Parse boosted tree model text dump
#'
#' Parse a boosted tree model text dump and return a \code{data.table}.
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
#' @importFrom data.table rbindlist
#' @importFrom data.table copy
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom magrittr not
#' @importFrom magrittr add
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
#' @param model object created by the \code{xgb.train} function.
#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information.
#'
#' @details
#' General function to convert a text dump of tree model to a \code{data.table}.
#'
#' The purpose is to help user to explore the model and get a better understanding of it.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{ID}: unique identifier of a node ;
#' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
#' \item \code{Split}: value of the chosen feature where is operated the split ;
#' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
#' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
#' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
#' \item \code{Quality}: it's the gain related to the split in this specific node ;
#' \item \code{Cover}: metric to measure the number of observation affected by the split ;
#' \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (class(model) != "xgb.Booster" & class(text) != "character") {
"model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>%
paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
stop()
}
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
if(is.null(text)){
text <- xgb.dump(model = model, with.stats = T)
}
position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
n_round <- min(length(position) - 1, n_first_tree)
addTreeId <- function(x, i) paste(i,x,sep = "-")
allTrees <- data.table()
anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
for (i in 1:n_round){
tree <- text[(position[i] + 1):(position[i + 1] - 1)]
# avoid tree made of a leaf only (no split)
if(length(tree) < 2) next
treeID <- i - 1
notLeaf <- str_match(tree, "leaf") %>% is.na
leaf <- notLeaf %>% not %>% tree[.]
branch <- notLeaf %>% tree[.]
idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
if(!is.null(feature_names)){
featureBranch <- feature_names[featureBranch + 1]
}
featureLeaf <- rep("Leaf", length(leaf))
splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
splitLeaf <- rep(NA, length(leaf))
yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
yesLeaf <- rep(NA, length(leaf))
noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
noLeaf <- rep(NA, length(leaf))
missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
missingLeaf <- rep(NA, length(leaf))
qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]
allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
}
yes <- allTrees[!is.na(Yes), Yes]
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "Yes.Feature",
value = allTrees[ID %in% yes, Feature])
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "Yes.Cover",
value = allTrees[ID %in% yes, Cover])
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "Yes.Quality",
value = allTrees[ID %in% yes, Quality])
no <- allTrees[!is.na(No), No]
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "No.Feature",
value = allTrees[ID %in% no, Feature])
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "No.Cover",
value = allTrees[ID %in% no, Cover])
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
j = "No.Quality",
value = allTrees[ID %in% no, Quality])
allTrees
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))

View File

@@ -1,160 +0,0 @@
#' Plot multiple graphs at the same time
#'
#' Plot multiple graph aligned by rows and columns.
#'
#' @importFrom data.table data.table
#' @param cols number of columns
#' @return NULL
multiplot <- function(..., cols = 1) {
plots <- list(...)
numPlots = length(plots)
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
ncol = cols, nrow = ceiling(numPlots / cols))
if (numPlots == 1) {
print(plots[[1]])
} else {
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
#' Parse the graph to extract vector of edges
#' @param element igraph object containing the path from the root to the leaf.
edge.parser <- function(element) {
edges.vector <- igraph::as_ids(element)
t <- tail(edges.vector, n = 1)
l <- length(edges.vector)
list(t,l)
}
#' Extract path from root to leaf from data.table
#' @param dt.tree data.table containing the nodes and edges of the trees
get.paths.to.leaf <- function(dt.tree) {
dt.not.leaf.edges <-
dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
trees <- dt.tree[,unique(Tree)]
paths <- list()
for (tree in trees) {
graph <-
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
paths.tmp <-
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree &
Feature == "Leaf", c(ID)])
paths <- c(paths, paths.tmp$vpath)
}
paths
}
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param model dump generated by the \code{xgb.train} function.
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#'
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' \itemize{
#' \item Count: number of leaf per level of deepness;
#' \item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
#' }
#'
#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#' min_child_weight = 50)
#'
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(model = NULL) {
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!requireNamespace("igraph", quietly = TRUE)) {
stop("igraph package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!requireNamespace("grid", quietly = TRUE)) {
stop("grid package is required for plotting the graph deepness.",
call. = FALSE)
}
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
dt.tree <- xgb.model.dt.tree(model = model)
dt.edge.elements <- data.table()
paths <- get.paths.to.leaf(dt.tree)
dt.edge.elements <-
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
dt.edge.summuize <-
dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
p1 <-
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank()
)
p2 <-
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) +
ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
multiplot(p1,p2,cols = 1)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(
c(
"Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
)
)

View File

@@ -1,79 +0,0 @@
#' Plot feature importance bar graph
#'
#' Read a data.table containing feature importance details and plot it (for both GLM and Trees).
#'
#' @importFrom magrittr %>%
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
#'
#' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
#'
#' @details
#' The purpose of this function is to easily represent the importance of each feature of a model.
#' The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' importance_matrix <- xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
#' xgb.plot.importance(importance_matrix)
#'
#' @export
xgb.plot.importance <-
function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))){
y.axe.name <- "Gain"
} else if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))){
y.axe.name <- "Weight"
} else {
stop("Importance matrix is not correct (column names issue)")
}
# To avoid issues in clustering when co-occurences are used
importance_matrix <-
importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature]
clusters <-
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain.or.Weight], numberOfClusters))
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
plot <-
ggplot2::ggplot(
importance_matrix, ggplot2::aes(
x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05
), environment = environment()
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab(y.axe.name) + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
)
return(plot)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(
c(
"Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "Gain.or.Weight"
)
)

View File

@@ -1,114 +0,0 @@
#' Project all trees on one tree and plot it
#'
#' Visualization of the ensemble of trees as a single collective unit.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#'
#' @param model dump generated by the \code{xgb.train} function.
#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param features.keep number of features to keep in each position of the multi trees.
#' @param plot.width width in pixels of the graph to produce
#' @param plot.height height in pixels of the graph to produce
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' @details
#'
#' This function tries to capture the complexity of gradient boosted tree ensemble
#' in a cohesive way.
#'
#' The goal is to improve the interpretability of the model generally seen as black box.
#' The function is dedicated to boosting applied to decision trees only.
#'
#' The purpose is to move from an ensemble of trees to a single tree only.
#'
#' It takes advantage of the fact that the shape of a binary tree is only defined by
#' its deepness (therefore in a boosting model, all trees have the same shape).
#'
#' Moreover, the trees tend to reuse the same features.
#'
#' The function will project each tree on one, and keep for each position the
#' \code{features.keep} first features (based on Gain per feature measure).
#'
#' This function is inspired by this blog post:
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#' min_child_weight = 50)
#'
#' p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3)
#' print(p)
#'
#' @export
xgb.plot.multi.trees <- function(model, feature_names = NULL, features.keep = 5, plot.width = NULL, plot.height = NULL){
tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)
# first number of the path represents the tree, then the following numbers are related to the path to follow
# root init
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
precedent.nodes <- root.nodes
while(tree.matrix[,sum(is.na(abs.node.position))] > 0) {
yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)]
no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)]
yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0")
no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1")
tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos]
tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos]
precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
}
tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")]
tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")]
remove.tree <- . %>% str_replace(pattern = "^\\d+-", replacement = "")
tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]
nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), features.keep)], " (", Quality[1:min(length(Quality), features.keep)], ")") %>% paste0(collapse = "\n")), by=abs.node.position]
edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL]
nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[,abs.node.position],
label = nodes.dt[,Text],
style = "filled",
color = "DimGray",
fillcolor= "Beige",
shape = "oval",
fontname = "Helvetica"
)
edges <- DiagrammeR::create_edges(from = edges.dt[,From],
to = edges.dt[,To],
color = "DimGray",
arrowsize = "1.5",
arrowhead = "vee",
fontname = "Helvetica",
rel = "leading_to")
graph <- DiagrammeR::create_graph(nodes_df = nodes,
edges_df = edges,
graph_attrs = "rankdir = LR")
DiagrammeR::render_graph(graph, width = plot.width, height = plot.height)
}
globalVariables(
c(
"Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"
)
)

View File

@@ -1,84 +0,0 @@
#' Plot a boosted tree model
#'
#' Read a tree model text dump and plot the model.
#'
#' @importFrom data.table data.table
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param plot.width the width of the diagram in pixels.
#' @param plot.height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
#' @details
#'
#' The content of each node is organised that way:
#'
#' \itemize{
#' \item \code{feature} value;
#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be;
#' \item \code{gain}: metric the importance of the node in the model.
#' }
#'
#' The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#'
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
}
allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)]
allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]
# rev is used to put the first tree on top.
nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID] %>% rev,
label = allTrees[,label] %>% rev,
style = "filled",
color = "DimGray",
fillcolor= allTrees[,filledcolor] %>% rev,
shape = allTrees[,shape] %>% rev,
data = allTrees[,Feature] %>% rev,
fontname = "Helvetica"
)
edges <- DiagrammeR::create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2),
to = allTrees[Feature != "Leaf", c(Yes, No)],
label = allTrees[Feature != "Leaf", paste("<",Split)] %>% c(rep("",nrow(allTrees[Feature != "Leaf"]))),
color = "DimGray",
arrowsize = "1.5",
arrowhead = "vee",
fontname = "Helvetica",
rel = "leading_to")
graph <- DiagrammeR::create_graph(nodes_df = nodes,
edges_df = edges,
graph_attrs = "rankdir = LR")
DiagrammeR::render_graph(graph, width = plot.width, height = plot.height)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label"))

View File

@@ -1,31 +0,0 @@
#' Save xgboost model to binary file
#'
#' Save xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#' @param fname the name of the binary file.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' xgb.save(bst, 'xgb.model')
#' bst <- xgb.load('xgb.model')
#' pred <- predict(bst, test$data)
#' @export
xgb.save <- function(model, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character")
}
if (class(model) == "xgb.Booster") {
model <- xgb.Booster.check(model)
.Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
return(FALSE)
}

View File

@@ -1,29 +0,0 @@
#' Save xgboost model to R's raw vector,
#' user can call xgb.load to load the model back from raw vector
#'
#' Save xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load(raw)
#' pred <- predict(bst, test$data)
#' @export
xgb.save.raw <- function(model) {
if (class(model) == "xgb.Booster"){
model <- model$handle
}
if (class(model) == "xgb.Booster.handle") {
raw <- .Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost")
return(raw)
}
stop("xgb.raw: the input must be xgb.Booster.handle. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
}

View File

@@ -1,237 +0,0 @@
#' eXtreme Gradient Boosting Training
#'
#' An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
#'
#' @param params the list of parameters.
#'
#' 1. General Parameters
#'
#' \itemize{
#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
#' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
#' }
#'
#' 2. Booster Parameters
#'
#' 2.1. Parameter for Tree Booster
#'
#' \itemize{
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
#' \item \code{max_depth} maximum depth of a tree. Default: 6
#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
#' }
#'
#' 2.2. Parameter for Linear Booster
#'
#' \itemize{
#' \item \code{lambda} L2 regularization term on weights. Default: 0
#' \item \code{lambda_bias} L2 regularization term on bias. Default: 0
#' \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' }
#'
#' 3. Task Parameters
#'
#' \itemize{
#' \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
#' \itemize{
#' \item \code{reg:linear} linear regression (Default).
#' \item \code{reg:logistic} logistic regression.
#' \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#' \item \code{num_class} set the number of classes. To use only with multiclass objectives.
#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}.
#' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
#' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#' }
#' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param data takes an \code{xgb.DMatrix} as the input.
#' @param nrounds the max number of iterations
#' @param watchlist what information should be printed when \code{verbose=1} or
#' \code{verbose=2}. Watchlist is used to specify validation set monitoring
#' during training. For example user can specify
#' watchlist=list(validation1=mat1, validation2=mat2) to watch
#' the performance of each round's model on mat1 and mat2
#'
#' @param obj customized objective function. Returns gradient and second order
#' gradient with given prediction and dtrain,
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
#' @param save_name the name or path for periodically saved model file.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the training function for \code{xgboost}.
#'
#' It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}),
#' therefore it is more flexible than \code{\link{xgboost}} function.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#' Number of threads can also be manually specified via \code{nthread} parameter.
#'
#' \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
#' \itemize{
#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
#' }
#'
#' Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
#'
#' This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
#' dtest <- dtrain
#' watchlist <- list(eval = dtest, train = dtrain)
#' logregobj <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' preds <- 1/(1 + exp(-preds))
#' grad <- preds - labels
#' hess <- preds * (1 - preds)
#' return(list(grad = grad, hess = hess))
#' }
#' evalerror <- function(preds, dtrain) {
#' labels <- getinfo(dtrain, "label")
#' err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
#' return(list(metric = "error", value = err))
#' }
#' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
#' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
#' @export
xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
early.stop.round = NULL, maximize = NULL,
save_period = 0, save_name = "xgboost.model", ...) {
dtrain <- data
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix")
}
if (verbose > 1) {
params <- append(params, list(silent = 0))
} else {
params <- append(params, list(silent = 1))
}
if (length(watchlist) != 0 && verbose == 0) {
warning('watchlist is provided but verbose=0, no evaluation information will be printed')
}
fit.call <- match.call()
dot.params <- list(...)
nms.params <- names(params)
nms.dot.params <- names(dot.params)
if (length(intersect(nms.params,nms.dot.params)) > 0)
stop("Duplicated term in parameters. Please check your list of params.")
params <- append(params, dot.params)
# customized objective and evaluation metric interface
if (!is.null(params$objective) && !is.null(obj))
stop("xgb.train: cannot assign two different objectives")
if (!is.null(params$objective))
if (class(params$objective) == 'function') {
obj <- params$objective
params$objective <- NULL
}
if (!is.null(params$eval_metric) && !is.null(feval))
stop("xgb.train: cannot assign two different evaluation metrics")
if (!is.null(params$eval_metric))
if (class(params$eval_metric) == 'function') {
feval <- params$eval_metric
params$eval_metric <- NULL
}
# Early stopping
if (!is.null(early.stop.round)){
if (!is.null(feval) && is.null(maximize))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (length(watchlist) == 0)
stop('For early stopping you need at least one set in watchlist.')
if (is.null(maximize) && is.null(params$eval_metric))
stop('Please set maximize to note whether the model is maximizing the evaluation or not.')
if (is.null(maximize))
{
if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) {
maximize <- FALSE
} else {
maximize <- TRUE
}
}
if (maximize) {
bestScore <- 0
} else {
bestScore <- Inf
}
bestInd <- 0
earlyStopflag = FALSE
if (length(watchlist) > 1)
warning('Only the first data set in watchlist is used for early stopping process.')
}
handle <- xgb.Booster(params, append(watchlist, dtrain))
bst <- xgb.handleToBooster(handle)
print.every.n <- max( as.integer(print.every.n), 1L)
for (i in 1:nrounds) {
succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
if (length(watchlist) != 0) {
msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
if (0 == ( (i - 1) %% print.every.n))
cat(paste(msg, "\n", sep = ""))
if (!is.null(early.stop.round))
{
score <- strsplit(msg,':|\\s+')[[1]][3]
score <- as.numeric(score)
if ( (maximize && score > bestScore) || (!maximize && score < bestScore)) {
bestScore <- score
bestInd <- i
} else {
earlyStopflag = TRUE
if (i - bestInd >= early.stop.round) {
cat('Stopping. Best iteration:', bestInd, '\n')
break
}
}
}
}
if (save_period > 0) {
if (i %% save_period == 0) {
xgb.save(bst, save_name)
}
}
}
bst <- xgb.Booster.check(bst)
if (!is.null(early.stop.round)) {
bst$bestScore <- bestScore
bst$bestInd <- bestInd
}
attr(bst, "call") <- fit.call
attr(bst, "params") <- params
return(bst)
}

View File

@@ -1,133 +0,0 @@
#' eXtreme Gradient Boosting (Tree) library
#'
#' A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
#'
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters.
#'
#' Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
#' \itemize{
#' \item \code{reg:linear} linear regression
#' \item \code{binary:logistic} logistic regression for classification
#' }
#' \item \code{eta} step size of each boosting step
#' \item \code{max.depth} maximum depth of the tree
#' \item \code{nthread} number of thread used in training, if not set, all threads are used
#' }
#'
#' Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
#'
#' See also \code{demo/} for walkthrough example in R.
#'
#' @param nrounds the max number of iterations
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
#' @param weight a vector indicating the weight for each row of the input.
#' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
#' If set to an integer \code{k}, training with a validation set will stop if the performance
#' keeps getting worse consecutively for \code{k} rounds.
#' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
#' \code{maximize=TRUE} means the larger the evaluation score the better.
#' @param save_period save the model to the disk in every \code{save_period} rounds, 0 means no such action.
#' @param save_name the name or path for periodically saved model file.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the modeling function for Xgboost.
#'
#' Parallelization is automatically enabled if \code{OpenMP} is present.
#'
#' Number of threads can also be manually specified via \code{nthread} parameter.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#'
#' @export
xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds,
verbose = 1, print.every.n = 1L, early.stop.round = NULL,
maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
dtrain <- xgb.get.DMatrix(data, label, missing, weight)
params <- append(params, list(...))
if (verbose > 0) {
watchlist <- list(train = dtrain)
} else {
watchlist <- list()
}
bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
early.stop.round = early.stop.round, maximize = maximize,
save_period = save_period, save_name = save_name)
return(bst)
}
#' Training part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.train
#' @usage data(agaricus.train)
#' @format A list containing a label vector, and a dgCMatrix object with 6513
#' rows and 127 variables
NULL
#' Test part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#' \item \code{label} the label for each record
#' \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
#' }
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.test
#' @usage data(agaricus.test)
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 126 variables
NULL

View File

@@ -1,44 +0,0 @@
R package for xgboost
=====================
[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
Installation
------------
We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now. For stable/pre-compiled(for Windows and OS X) version, please install from CRAN:
```r
install.packages('xgboost')
```
For up-to-date version, please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
```r
devtools::install_github('dmlc/xgboost',subdir='R-package')
```
Examples
--------
* Please visit [walk through example](demo).
* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
Notes
-----
If you face an issue installing the package using ```devtools::install_github```, something like this (even after updating libxml and RCurl as lot of forums say) -
```
devtools::install_github('dmlc/xgboost',subdir='R-package')
Downloading github repo dmlc/xgboost@master
Error in function (type, msg, asError = TRUE) :
Peer certificate cannot be authenticated with given CA certificates
```
To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
```
1. Clone the current repository and set your workspace to xgboost/R-package/
2. Run R CMD INSTALL --build . in terminal to get the tarball.
3. Run install.packages('path_to_the_tarball',repo=NULL) in R to install.
```

Binary file not shown.

Binary file not shown.

View File

@@ -1,11 +0,0 @@
basic_walkthrough Basic feature walkthrough
caret_wrapper Use xgboost to train in caret library
custom_objective Cutomize loss function, and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves
early_stopping Early Stop in training
poisson_regression Poisson Regression on count data

View File

@@ -1,19 +0,0 @@
XGBoost R Feature Walkthrough
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Train a xgboost model from caret library](caret_wrapper.R)
* [Cutomize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
* [Create a sparse matrix from a dense one](create_sparse_matrix.R)
Benchmarks
====
* [Starter script for Kaggle Higgs Boson](../../demo/kaggle-higgs)
Notes
====
* Contribution of examples, benchmarks is more than welcomed!
* If you like to share how you use xgboost to solve your problem, send a pull request:)

View File

@@ -1,110 +0,0 @@
require(xgboost)
require(methods)
# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom can be eaten
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)
#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost you can put matrix in data field
# note: we are putting in sparse matrix here, xgboost naturally handles sparse input
# use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
print("Training xgboost with sparseMatrix")
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic")
# alternatively, you can put in dense matrix, i.e. basic R-matrix
print("Training xgboost with Matrix")
bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic")
# you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
print("Training xgboost with xgb.DMatrix")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
objective = "binary:logistic")
# Verbose = 0,1,2
print("Train xgboost with verbose 0, no message")
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 0)
print("Train xgboost with verbose 1, print evaluation metric")
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 1)
print("Train xgboost with verbose 2, also print information about tree")
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
nthread = 2, objective = "binary:logistic", verbose = 2)
# you can also specify data as file path to a LibSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#-------------------save and load models-------------------------
# save model to binary local file
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
# save model to R's raw vector
raw = xgb.save.raw(bst)
# load binary model to R
bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them is tagged with name
watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list
print("Train xgboost using xgb.train with watchlist")
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print("train xgboost using xgb.train with watchlist, watch logloss and error")
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
eval.metric = "error", eval.metric = "logloss",
nthread = 2, objective = "binary:logistic")
# xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
nthread = 2, objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo
label = getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
# You can dump the tree you learned using xgb.dump into a text file
xgb.dump(bst, "dump.raw.txt", with.stats = T)
# Finally, you can check which features are the most important.
print("Most important features (look at column Gain):")
imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
print(imp_matrix)
# Feature importance bar plot by gain
print("Feature importance Plot : ")
print(xgb.plot.importance(importance_matrix = imp_matrix))

View File

@@ -1,26 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
###
# advanced: start from a initial base prediction
#
print('start running example to start from a initial prediction')
# train xgboost for 1 round
param <- list(max.depth=2,eta=1,nthread = 2, silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=TRUE, will always give you margin values before logistic transformation
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# set the base_margin property of dtrain and dtest
# base margin is the base prediction we will boost from
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
print('this is result of boost from initial prediction')
bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)

View File

@@ -1,35 +0,0 @@
# install development version of caret library that contains xgboost models
devtools::install_github("topepo/caret/pkg/caret")
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)
# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
fitControl <- trainControl(method = "cv", number = 10, repeats = 2, search = "random")
# train a xgbTree model using caret::train
model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)
# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)
# See model results
print(model)

View File

@@ -1,90 +0,0 @@
require(xgboost)
require(Matrix)
require(data.table)
if (!require(vcd)) {
install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
require(vcd)
}
# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.
# A categorical variable is one which have a fixed number of values. By example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
#
# In R, categorical variable is called Factor.
# Type ?factor in console for more information.
#
# In this demo we will see how to transform a dense dataframe with categorical variables to a sparse matrix before analyzing it in Xgboost.
# The method we are going to see is usually called "one hot encoding".
#load Arthritis dataset in memory.
data(Arthritis)
# create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance are really good).
df <- data.table(Arthritis, keep.rownames = F)
# Let's have a look to the data.table
cat("Print the dataset\n")
print(df)
# 2 columns have factor type, one has ordinal type (ordinal variable is a categorical variable with values wich can be ordered, here: None > Some > Marked).
cat("Structure of the dataset\n")
str(df)
# Let's add some new categorical features to see if it helps. Of course these feature are highly correlated to the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]
# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]
# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[,ID:=NULL]
# List the different values for the column Treatment: Placebo, Treated.
cat("Values of the categorical feature Treatment\n")
print(levels(df[,Treatment]))
# Next step, we will transform the categorical data to dummy variables.
# This method is also called one hot encoding.
# The purpose is to transform each value of each categorical feature in one binary feature.
#
# Let's take, the column Treatment will be replaced by two columns, Placebo, and Treated. Each of them will be binary. For example an observation which had the value Placebo in column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column Treated.
#
# Formulae Improved~.-1 used below means transform all categorical features but column Improved to binary values.
# Column Improved is excluded because it will be our output column, the one we want to predict.
sparse_matrix = sparse.model.matrix(Improved~.-1, data = df)
cat("Encoding of the sparse Matrix\n")
print(sparse_matrix)
# Create the output vector (not sparse)
# 1. Set, for all rows, field in Y column to 0;
# 2. set Y to 1 when Improved == Marked;
# 3. Return Y column
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
# Following is the same process as other demo
cat("Learning...\n")
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
print(importance)
# According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
# Does these result make sense?
# Let's check some Chi2 between each of these features and the outcome.
print(chisq.test(df$Age, df$Y))
# Pearson correlation between Age and illness disappearing is 35
print(chisq.test(df$AgeDiscret, df$Y))
# Our first simplification of Age gives a Pearson correlation of 8.
print(chisq.test(df$AgeCat, df$Y))
# The perfectly random split I did between young and old at 30 years old have a low correlation of 2. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.

View File

@@ -1,51 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
nround <- 2
param <- list(max.depth=2,eta=1,silent=1,nthread = 2, objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})
cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5,
metrics={'error'}, showsd = FALSE)
###
# you can also do cross validation with cutomized loss function
# See custom_objective.R
##
print ('running cross validation, with cutomsized loss function')
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth=2,eta=1,silent=1,
objective = logregobj, eval_metric = evalerror)
# train with customized objective
xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)
# do cross validation with prediction values for each fold
res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
res$dt
length(res$pred)

View File

@@ -1,65 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
objective=logregobj, eval_metric=evalerror)
print ('start training with user customized objective')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist)
#
# there can be cases where you want additional information
# being considered besides the property of DMatrix you can get by getinfo
# you can set additional information as attributes if DMatrix
# set label attribute of dtrain to be label, we use label as an example, it can be anything
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
# this is new customized objective, where you can access things you set
# same thing applies to customized evaluation function
logregobjattr <- function(preds, dtrain) {
# now you can access the attribute in customized function
labels <- attr(dtrain, 'label')
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
objective=logregobjattr, eval_metric=evalerror)
print ('start training with user customized objective, with additional attributes in DMatrix')
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, num_round, watchlist)

View File

@@ -1,40 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
watchlist <- list(eval = dtest)
num_round <- 20
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
print ('start training with early Stopping setting')
bst <- xgb.train(param, dtrain, num_round, watchlist,
objective = logregobj, eval_metric = evalerror, maximize = FALSE,
early.stop.round = 3)
bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
objective = logregobj, eval_metric = evalerror,
maximize = FALSE, early.stop.round = 3)

View File

@@ -1,34 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
# you can fit a linear regression, or logistic regression model
##
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, alpha = 0.0001, lambda = 1)
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
##
# the rest of settings are the same
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')

View File

@@ -1,7 +0,0 @@
data(mtcars)
head(mtcars)
bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11],
objective='count:poisson',nrounds=5)
pred = predict(bst,as.matrix(mtcars[,-11]))
sqrt(mean((pred-mtcars[,11])^2))

View File

@@ -1,23 +0,0 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 2
# training the model for two rounds
bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')
### predict using first 1 tree
ypred1 = predict(bst, dtest, ntreelimit=1)
# by default, we predict using all the trees
ypred2 = predict(bst, dtest)
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')

View File

@@ -1,52 +0,0 @@
require(xgboost)
require(data.table)
require(Matrix)
set.seed(1982)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
nround = 4
# training the model for two rounds
bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
# Model accuracy without new features
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# by default, we predict using all the trees
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)
create.new.tree.features <- function(model, original.features){
pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
cols <- list()
for(i in 1:length(trees)){
# max is not the real max but it s not important for the purpose of adding features
leaf.id <- sort(unique(pred_with_leaf[,i]))
cols[[i]] <- factor(x = pred_with_leaf[,i], level = leaf.id)
}
cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
}
# Convert previous features to one hot encoding
new.features.train <- create.new.tree.features(bst, agaricus.train$data)
new.features.test <- create.new.tree.features(bst, agaricus.test$data)
# learning with new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
# Model accuracy with new features
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))

View File

@@ -1,12 +0,0 @@
# running all scripts in demo folder
demo(basic_walkthrough)
demo(custom_objective)
demo(boost_from_prediction)
demo(predict_first_ntree)
demo(generalized_linear_model)
demo(cross_validation)
demo(create_sparse_matrix)
demo(predict_leaf_indices)
demo(early_stopping)
demo(poisson_regression)
demo(caret_wrapper)

View File

@@ -1,32 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
\title{Test part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 1611
rows and 126 variables}
\usage{
data(agaricus.test)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

View File

@@ -1,32 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgboost.R
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
\title{Training part from Mushroom Data Set}
\format{A list containing a label vector, and a dgCMatrix object with 6513
rows and 127 variables}
\usage{
data(agaricus.train)
}
\description{
This data set is originally from the Mushroom data set,
UCI Machine Learning Repository.
}
\details{
This data set includes the following fields:
\itemize{
\item \code{label} the label for each record
\item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
}
}
\references{
https://archive.ics.uci.edu/ml/datasets/Mushroom
Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
School of Information and Computer Science.
}
\keyword{datasets}

View File

@@ -1,15 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{edge.parser}
\alias{edge.parser}
\title{Parse the graph to extract vector of edges}
\usage{
edge.parser(element)
}
\arguments{
\item{element}{igraph object containing the path from the root to the leaf.}
}
\description{
Parse the graph to extract vector of edges
}

View File

@@ -1,15 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{get.paths.to.leaf}
\alias{get.paths.to.leaf}
\title{Extract path from root to leaf from data.table}
\usage{
get.paths.to.leaf(dt.tree)
}
\arguments{
\item{dt.tree}{data.table containing the nodes and edges of the trees}
}
\description{
Extract path from root to leaf from data.table
}

View File

@@ -1,42 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
\docType{methods}
\name{getinfo}
\alias{getinfo}
\alias{getinfo,xgb.DMatrix-method}
\title{Get information of an xgb.DMatrix object}
\usage{
getinfo(object, ...)
\S4method{getinfo}{xgb.DMatrix}(object, name)
}
\arguments{
\item{object}{Object of class \code{xgb.DMatrix}}
\item{...}{other parameters}
\item{name}{the name of the field to get}
}
\description{
Get information of an xgb.DMatrix object
}
\details{
The information can be one of the following:
\itemize{
\item \code{label}: label Xgboost learn from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
\item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
}
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}

View File

@@ -1,15 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{multiplot}
\alias{multiplot}
\title{Plot multiple graphs at the same time}
\usage{
multiplot(..., cols = 1)
}
\arguments{
\item{cols}{number of columns}
}
\description{
Plot multiple graph aligned by rows and columns.
}

View File

@@ -1,23 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nrow.xgb.DMatrix.R
\docType{methods}
\name{nrow,xgb.DMatrix-method}
\alias{nrow,xgb.DMatrix-method}
\title{Number of xgb.DMatrix rows}
\usage{
\S4method{nrow}{xgb.DMatrix}(x)
}
\arguments{
\item{x}{Object of class \code{xgb.DMatrix}}
}
\description{
\code{nrow} return the number of rows present in the \code{xgb.DMatrix}.
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
stopifnot(nrow(dtrain) == nrow(train$data))
}

View File

@@ -1,53 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NA,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Boost"}
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}
\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.
}
\details{
The option \code{ntreelimit} purpose is to let the user train a model with lots
of trees but use only the first trees for prediction to avoid overfitting
(without having to train a new model with less trees).
The option \code{predleaf} purpose is inspired from §3.1 of the paper
\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
The idea is to use the model as a generator of new features which capture non linear link
from original features.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
pred <- predict(bst, test$data)
}

View File

@@ -1,18 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.handle.R
\docType{methods}
\name{predict,xgb.Booster.handle-method}
\alias{predict,xgb.Booster.handle-method}
\title{Predict method for eXtreme Gradient Boosting model handle}
\usage{
\S4method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class "xgb.Boost.handle"}
\item{...}{Parameters pass to \code{predict.xgb.Booster}}
}
\description{
Predicted values based on xgb.Booster.handle object.
}

View File

@@ -1,44 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
\docType{methods}
\name{setinfo}
\alias{setinfo}
\alias{setinfo,xgb.DMatrix-method}
\title{Set information of an xgb.DMatrix object}
\usage{
setinfo(object, ...)
\S4method{setinfo}{xgb.DMatrix}(object, name, info)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
}
\description{
Set information of an xgb.DMatrix object
}
\details{
It can be one of the following:
\itemize{
\item \code{label}: label Xgboost learn from ;
\item \code{weight}: to do a weight rescale ;
\item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
\item \code{group}.
}
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
labels <- getinfo(dtrain, 'label')
setinfo(dtrain, 'label', 1-labels)
labels2 <- getinfo(dtrain, 'label')
stopifnot(all(labels2 == 1-labels))
}

View File

@@ -1,31 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
\docType{methods}
\name{slice}
\alias{slice}
\alias{slice,xgb.DMatrix-method}
\title{Get a new DMatrix containing the specified rows of
orginal xgb.DMatrix object}
\usage{
slice(object, ...)
\S4method{slice}{xgb.DMatrix}(object, idxset, ...)
}
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{idxset}{a integer vector of indices of rows needed}
}
\description{
Get a new DMatrix containing the specified rows of
orginal xgb.DMatrix object
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
dsub <- slice(dtrain, 1:3)
}

View File

@@ -1,30 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = NA, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character
indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other information to pass to \code{info}.}
}
\description{
Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file.
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}

View File

@@ -1,24 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
\usage{
xgb.DMatrix.save(DMatrix, fname)
}
\arguments{
\item{DMatrix}{the DMatrix object}
\item{fname}{the name of the binary file.}
}
\description{
Save xgb.DMatrix object to binary file
}
\examples{
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(train$data, label=train$label)
xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
dtrain <- xgb.DMatrix('xgb.DMatrix.data')
}

View File

@@ -1,88 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.create.features.R
\name{xgb.create.features}
\alias{xgb.create.features}
\title{Create new features from a previously learned model}
\usage{
xgb.create.features(model, training.data)
}
\arguments{
\item{model}{decision tree boosting model learned on the original data}
\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
}
\value{
\code{dgCMatrix} matrix including both the original data and the new features.
}
\description{
May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
}
\details{
This is the function inspired from the paragraph 3.1 of the paper:
\strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
Joaquin Quiñonero Candela)}
International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
Extract explaining the method:
"\emph{We found that boosted decision trees are a powerful and very
convenient way to implement non-linear and tuple transformations
of the kind we just described. We treat each individual
tree as a categorical feature that takes as value the
index of the leaf an instance ends up falling in. We use
1-of-K coding of this type of features.
For example, consider the boosted tree model in Figure 1 with 2 subtrees,
where the first subtree has 3 leafs and the second 2 leafs. If an
instance ends up in leaf 2 in the first subtree and leaf 1 in
second subtree, the overall input to the linear classifier will
be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
correspond to the leaves of the first subtree and last 2 to
those of the second subtree.
[...]
We can understand boosted decision tree
based transformation as a supervised feature encoding that
converts a real-valued vector into a compact binary-valued
vector. A traversal from root node to a leaf node represents
a rule on certain features.}"
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
nround = 4
bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
# Model accuracy without new features
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Convert previous features to one hot encoding
new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
# learning with new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
# Model accuracy with new features
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))
}

View File

@@ -1,109 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.cv.R
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL,
feval = NULL, stratified = TRUE, folds = NULL, verbose = T,
print.every.n = 1L, early.stop.round = NULL, maximize = NULL, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
See \link{xgb.train} for further details.
See also demo/ for walkthrough example in R.}
\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{option field, when data is \code{Matrix}}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{prediction}{A logical value indicating whether to return the prediction vector.}
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
\item{metrics, }{list of evaluation metrics to be used in corss validation,
when it is not specified, the evaluation metric is chosen according to objective function.
Possible options are:
\itemize{
\item \code{error} binary classification error rate
\item \code{rmse} Rooted mean square error
\item \code{logloss} negative log-likelihood function
\item \code{auc} Area under curve
\item \code{merror} Exact matching error, used to evaluate multi-class classification
}}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}
\item{feval}{custimized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}
\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}}
\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices).
If folds are supplied, the nfold and stratified parameters would be ignored.}
\item{verbose}{\code{boolean}, print the statistics during the process}
\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{...}{other parameters to pass to \code{params}.}
}
\value{
If \code{prediction = TRUE}, a list with the following elements is returned:
\itemize{
\item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set
\item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds.
}
If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned.
}
\description{
The cross valudation function of xgboost
}
\details{
The original sample is randomly partitioned into \code{nfold} equal size subsamples.
Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data.
The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data.
All observations are used for both training and validation.
Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
history <- xgb.cv(data = dtrain, nround=3, nthread = 2, nfold = 5, metrics=list("rmse","auc"),
max.depth =3, eta = 1, objective = "binary:logistic")
print(history)
}

View File

@@ -1,45 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
\usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE)
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
\item{fmap}{feature map file representing the type of feature.
Detailed description could be found at
\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for walkthrough example in R, and
\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
for example Format.}
\item{with.stats}{whether dump statistics of splits
When this option is on, the model dump comes with two additional statistics:
gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.}
}
\value{
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
}
\description{
Save a xgboost model to text file. Could be parsed later.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
# print the model without saving it to a file
print(xgb.dump(bst))
}

View File

@@ -1,65 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, model = NULL, data = NULL,
label = NULL, target = function(x) ((x + label) == 2))
}
\arguments{
\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{model}{generated by the \code{xgb.train} function.}
\item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
\item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
}
\description{
Create a \code{data.table} of the most important features of a model.
}
\details{
This function is for both linear and tree models.
\code{data.table} is returned by the function.
The columns are :
\itemize{
\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
\item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
\item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
}
If you don't provide \code{feature_names}, index of the features will be used instead.
Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
Co-occurence count
------------------
The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
# Same thing with co-occurence computation this time
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)
}

View File

@@ -1,26 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.load.R
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
\usage{
xgb.load(modelfile)
}
\arguments{
\item{modelfile}{the name of the binary file.}
}
\description{
Load xgboost model from the binary model file
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}

View File

@@ -1,55 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Parse boosted tree model text dump}
\usage{
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}
\item{model}{object created by the \code{xgb.train} function.}
\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.}
}
\value{
A \code{data.table} of the features used in the model with their gain, cover and few other information.
}
\description{
Parse a boosted tree model text dump and return a \code{data.table}.
}
\details{
General function to convert a text dump of tree model to a \code{data.table}.
The purpose is to help user to explore the model and get a better understanding of it.
The columns of the \code{data.table} are:
\itemize{
\item \code{ID}: unique identifier of a node ;
\item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
\item \code{Split}: value of the chosen feature where is operated the split ;
\item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
\item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
\item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
\item \code{Quality}: it's the gain related to the split in this specific node ;
\item \code{Cover}: metric to measure the number of observation affected by the split ;
\item \code{Tree}: ID of the tree. It is included in the main ID ;
\item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
}

View File

@@ -1,46 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{xgb.plot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(model = NULL)
}
\arguments{
\item{model}{dump generated by the \code{xgb.train} function.}
}
\value{
Two graphs showing the distribution of the model deepness.
}
\description{
Generate a graph to plot the distribution of deepness among trees.
}
\details{
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user to find the best trade-off to set
the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
See \link{xgb.train} for more information about these parameters.
The graph is made of two parts:
\itemize{
\item Count: number of leaf per level of deepness;
\item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
}
This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
min_child_weight = 50)
xgb.plot.deepness(model = bst)
}

View File

@@ -1,40 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.importance.R
\name{xgb.plot.importance}
\alias{xgb.plot.importance}
\title{Plot feature importance bar graph}
\usage{
xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
}
\arguments{
\item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}
\item{numberOfClusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
}
\value{
A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
}
\description{
Read a data.table containing feature importance details and plot it (for both GLM and Trees).
}
\details{
The purpose of this function is to easily represent the importance of each feature of a model.
The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
}
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
xgb.plot.importance(importance_matrix)
}

View File

@@ -1,58 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.multi.trees.R
\name{xgb.plot.multi.trees}
\alias{xgb.plot.multi.trees}
\title{Project all trees on one tree and plot it}
\usage{
xgb.plot.multi.trees(model, feature_names = NULL, features.keep = 5,
plot.width = NULL, plot.height = NULL)
}
\arguments{
\item{model}{dump generated by the \code{xgb.train} function.}
\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{features.keep}{number of features to keep in each position of the multi trees.}
\item{plot.width}{width in pixels of the graph to produce}
\item{plot.height}{height in pixels of the graph to produce}
}
\value{
Two graphs showing the distribution of the model deepness.
}
\description{
Visualization of the ensemble of trees as a single collective unit.
}
\details{
This function tries to capture the complexity of gradient boosted tree ensemble
in a cohesive way.
The goal is to improve the interpretability of the model generally seen as black box.
The function is dedicated to boosting applied to decision trees only.
The purpose is to move from an ensemble of trees to a single tree only.
It takes advantage of the fact that the shape of a binary tree is only defined by
its deepness (therefore in a boosting model, all trees have the same shape).
Moreover, the trees tend to reuse the same features.
The function will project each tree on one, and keep for each position the
\code{features.keep} first features (based on Gain per feature measure).
This function is inspired by this blog post:
\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
min_child_weight = 50)
p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3)
print(p)
}

View File

@@ -1,48 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL,
plot.width = NULL, plot.height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
\item{plot.width}{the width of the diagram in pixels.}
\item{plot.height}{the height of the diagram in pixels.}
}
\value{
A \code{DiagrammeR} of the model.
}
\description{
Read a tree model text dump and plot the model.
}
\details{
The content of each node is organised that way:
\itemize{
\item \code{feature} value;
\item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be;
\item \code{gain}: metric the importance of the node in the model.
}
The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose.
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
}

View File

@@ -1,28 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.save.R
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
\usage{
xgb.save(model, fname)
}
\arguments{
\item{model}{the model object.}
\item{fname}{the name of the binary file.}
}
\description{
Save xgboost model from xgboost or xgb.train
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}

View File

@@ -1,27 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.save.raw.R
\name{xgb.save.raw}
\alias{xgb.save.raw}
\title{Save xgboost model to R's raw vector,
user can call xgb.load to load the model back from raw vector}
\usage{
xgb.save.raw(model)
}
\arguments{
\item{model}{the model object.}
}
\description{
Save xgboost model from xgboost or xgb.train
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
raw <- xgb.save.raw(bst)
bst <- xgb.load(raw)
pred <- predict(bst, test$data)
}

View File

@@ -1,144 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.train.R
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
\usage{
xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
feval = NULL, verbose = 1, print.every.n = 1L,
early.stop.round = NULL, maximize = NULL, save_period = 0,
save_name = "xgboost.model", ...)
}
\arguments{
\item{params}{the list of parameters.
1. General Parameters
\itemize{
\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
\item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
}
2. Booster Parameters
2.1. Parameter for Tree Booster
\itemize{
\item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
\item \code{max_depth} maximum depth of a tree. Default: 6
\item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
}
2.2. Parameter for Linear Booster
\itemize{
\item \code{lambda} L2 regularization term on weights. Default: 0
\item \code{lambda_bias} L2 regularization term on bias. Default: 0
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
}
3. Task Parameters
\itemize{
\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below:
\itemize{
\item \code{reg:linear} linear regression (Default).
\item \code{reg:logistic} logistic regression.
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
\item \code{num_class} set the number of classes. To use only with multiclass objectives.
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}.
\item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class.
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
}
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
}}
\item{data}{takes an \code{xgb.DMatrix} as the input.}
\item{nrounds}{the max number of iterations}
\item{watchlist}{what information should be printed when \code{verbose=1} or
\code{verbose=2}. Watchlist is used to specify validation set monitoring
during training. For example user can specify
watchlist=list(validation1=mat1, validation2=mat2) to watch
the performance of each round's model on mat1 and mat2}
\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain,}
\item{feval}{custimized evaluation function. Returns
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain,}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both}
\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
\item{save_name}{the name or path for periodically saved model file.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
}
\details{
This is the training function for \code{xgboost}.
It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}),
therefore it is more flexible than \code{\link{xgboost}} function.
Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter.
\code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
\itemize{
\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
\item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
\item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
}
Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}.
This function only accepts an \code{\link{xgb.DMatrix}} object as the input.
}
\examples{
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- dtrain
watchlist <- list(eval = dtest, train = dtrain)
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror)
bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist)
}

View File

@@ -1,83 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgboost.R
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
\usage{
xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds, verbose = 1, print.every.n = 1L,
early.stop.round = NULL, maximize = NULL, save_period = 0,
save_name = "xgboost.model", ...)
}
\arguments{
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{label}{the response variable. User should not set this field,
if data is local data file or \code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.}
\item{weight}{a vector indicating the weight for each row of the input.}
\item{params}{the list of parameters.
Commonly used ones are:
\itemize{
\item \code{objective} objective function, common ones are
\itemize{
\item \code{reg:linear} linear regression
\item \code{binary:logistic} logistic regression for classification
}
\item \code{eta} step size of each boosting step
\item \code{max.depth} maximum depth of the tree
\item \code{nthread} number of thread used in training, if not set, all threads are used
}
Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list.
See also \code{demo/} for walkthrough example in R.}
\item{nrounds}{the max number of iterations}
\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
\item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
If set to an integer \code{k}, training with a validation set will stop if the performance
keeps getting worse consecutively for \code{k} rounds.}
\item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
\code{maximize=TRUE} means the larger the evaluation score the better.}
\item{save_period}{save the model to the disk in every \code{save_period} rounds, 0 means no such action.}
\item{save_name}{the name or path for periodically saved model file.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{
A simple interface for training xgboost model. Look at \code{\link{xgb.train}} function for a more advanced interface.
}
\details{
This is the modeling function for Xgboost.
Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
pred <- predict(bst, test$data)
}

View File

@@ -1,8 +0,0 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o

View File

@@ -1,19 +0,0 @@
# package root
PKGROOT=./
# _*_ mode: Makefile; _*_
# This file is only used for windows compilation from github
# It will be replaced by Makevars in CRAN version
.PHONY: all xgblib
all: $(SHLIB)
$(SHLIB): xgblib
xgblib:
cp -r ../../src .
cp -r ../../wrapper .
cp -r ../../subtree .
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
$(OBJECTS) : xgblib

View File

@@ -1,344 +0,0 @@
// Copyright (c) 2014 by Contributors
#include <vector>
#include <string>
#include <utility>
#include <cstring>
#include <cstdio>
#include <sstream>
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "xgboost_R.h"
using namespace std;
using namespace xgboost;
extern "C" {
void XGBoostAssert_R(int exp, const char *fmt, ...);
void XGBoostCheck_R(int exp, const char *fmt, ...);
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...);
}
// implements error handling
namespace xgboost {
namespace utils {
extern "C" {
void (*Printf)(const char *fmt, ...) = Rprintf;
int (*SPrintf)(char *buf, size_t size, const char *fmt, ...) = XGBoostSPrintf_R;
void (*Assert)(int exp, const char *fmt, ...) = XGBoostAssert_R;
void (*Check)(int exp, const char *fmt, ...) = XGBoostCheck_R;
void (*Error)(const char *fmt, ...) = error;
}
bool CheckNAN(double v) {
return ISNAN(v);
}
double LogGamma(double v) {
return lgammafn(v);
}
} // namespace utils
namespace random {
void Seed(unsigned seed) {
// warning("parameter seed is ignored, please set random seed using set.seed");
}
double Uniform(void) {
return unif_rand();
}
double Normal(void) {
return norm_rand();
}
} // namespace random
} // namespace xgboost
// call before wrapper starts
inline void _WrapperBegin(void) {
GetRNGstate();
}
// call after wrapper starts
inline void _WrapperEnd(void) {
PutRNGstate();
}
// do nothing, check error
inline void CheckErr(int ret) {
}
extern "C" {
SEXP XGCheckNullPtr_R(SEXP handle) {
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
}
void _DMatrixFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
_WrapperBegin();
DMatrixHandle handle;
CheckErr(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing) {
_WrapperBegin();
SEXP dim = getAttrib(mat, R_DimSymbol);
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
double *din = REAL(mat);
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nrow; ++i) {
for (size_t j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
}
}
DMatrixHandle handle;
CheckErr(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
_WrapperBegin();
const int *p_indptr = INTEGER(indptr);
const int *p_indices = INTEGER(indices);
const double *p_data = REAL(data);
int nindptr = length(indptr);
int ndata = length(data);
std::vector<bst_ulong> col_ptr_(nindptr);
std::vector<unsigned> indices_(ndata);
std::vector<float> data_(ndata);
for (int i = 0; i < nindptr; ++i) {
col_ptr_[i] = static_cast<bst_ulong>(p_indptr[i]);
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
indices_[i] = static_cast<unsigned>(p_indices[i]);
data_[i] = static_cast<float>(p_data[i]);
}
DMatrixHandle handle;
CheckErr(XGDMatrixCreateFromCSC(BeginPtr(col_ptr_), BeginPtr(indices_),
BeginPtr(data_), nindptr, ndata,
&handle));
_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
_WrapperBegin();
int len = length(idxset);
std::vector<int> idxvec(len);
for (int i = 0; i < len; ++i) {
idxvec[i] = INTEGER(idxset)[i] - 1;
}
DMatrixHandle res;
CheckErr(XGDMatrixSliceDMatrix(R_ExternalPtrAddr(handle),
BeginPtr(idxvec), len,
&res));
_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
_WrapperBegin();
CheckErr(XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
CHAR(asChar(fname)), asInteger(silent)));
_WrapperEnd();
}
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
_WrapperBegin();
int len = length(array);
const char *name = CHAR(asChar(field));
if (!strcmp("group", name)) {
std::vector<unsigned> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
CheckErr(XGDMatrixSetGroup(R_ExternalPtrAddr(handle), BeginPtr(vec), len));
} else {
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = REAL(array)[i];
}
CheckErr(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)),
BeginPtr(vec), len));
}
_WrapperEnd();
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
_WrapperBegin();
bst_ulong olen;
const float *res;
CheckErr(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)),
&olen,
&res));
_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixNumRow_R(SEXP handle) {
bst_ulong nrow;
CheckErr(XGDMatrixNumRow(R_ExternalPtrAddr(handle), &nrow));
return ScalarInteger(static_cast<int>(nrow));
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
CheckErr(XGBoosterFree(R_ExternalPtrAddr(ext)));
R_ClearExternalPtr(ext);
}
SEXP XGBoosterCreate_R(SEXP dmats) {
_WrapperBegin();
int len = length(dmats);
std::vector<void*> dvec;
for (int i = 0; i < len; ++i) {
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
BoosterHandle handle;
CheckErr(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
_WrapperEnd();
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
_WrapperBegin();
CheckErr(XGBoosterSetParam(R_ExternalPtrAddr(handle),
CHAR(asChar(name)),
CHAR(asChar(val))));
_WrapperEnd();
}
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
_WrapperBegin();
CheckErr(XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain)));
_WrapperEnd();
}
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
_WrapperBegin();
utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
int len = length(grad);
std::vector<float> tgrad(len), thess(len);
#pragma omp parallel for schedule(static)
for (int j = 0; j < len; ++j) {
tgrad[j] = REAL(grad)[j];
thess[j] = REAL(hess)[j];
}
CheckErr(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dtrain),
BeginPtr(tgrad), BeginPtr(thess),
len));
_WrapperEnd();
}
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
_WrapperBegin();
utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
int len = length(dmats);
std::vector<void*> vec_dmats;
std::vector<std::string> vec_names;
std::vector<const char*> vec_sptr;
for (int i = 0; i < len; ++i) {
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
}
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
const char *ret;
CheckErr(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
BeginPtr(vec_dmats),
BeginPtr(vec_sptr),
len, &ret));
_WrapperEnd();
return mkString(ret);
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
bst_ulong olen;
const float *res;
CheckErr(XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(option_mask),
asInteger(ntree_limit),
&olen, &res));
_WrapperEnd();
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
_WrapperBegin();
CheckErr(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
_WrapperEnd();
}
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
_WrapperBegin();
CheckErr(XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))));
_WrapperEnd();
}
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
_WrapperBegin();
XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
RAW(raw),
length(raw));
_WrapperEnd();
}
SEXP XGBoosterModelToRaw_R(SEXP handle) {
bst_ulong olen;
_WrapperBegin();
const char *raw;
CheckErr(XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen, &raw));
_WrapperEnd();
SEXP ret = PROTECT(allocVector(RAWSXP, olen));
if (olen != 0) {
memcpy(RAW(ret), raw, olen);
}
UNPROTECT(1);
return ret;
}
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
_WrapperBegin();
bst_ulong olen;
const char **res;
CheckErr(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
asInteger(with_stats),
&olen, &res));
_WrapperEnd();
SEXP out = PROTECT(allocVector(STRSXP, olen));
for (size_t i = 0; i < olen; ++i) {
stringstream stream;
stream << "booster[" << i <<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
UNPROTECT(1);
return out;
}
}

View File

@@ -1,158 +0,0 @@
/*!
* Copyright 2014 (c) by Contributors
* \file xgboost_wrapper_R.h
* \author Tianqi Chen
* \brief R wrapper of xgboost
*/
#ifndef XGBOOST_WRAPPER_R_H_ // NOLINT(*)
#define XGBOOST_WRAPPER_R_H_ // NOLINT(*)
extern "C" {
#include <Rinternals.h>
#include <R_ext/Random.h>
#include <Rmath.h>
}
extern "C" {
/*!
* \brief check whether a handle is NULL
* \param handle
* \return whether it is null ptr
*/
SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief load a data matrix
* \param fname name of the content
* \param silent whether print messages
* \return a loaded data matrix
*/
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
/*!
* \brief create matrix content from dense matrix
* This assumes the matrix is stored in column major format
* \param data R Matrix object
* \param missing which value to represent missing value
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
/*!
* \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced
* \param idxset index set
* \return a sliced new matrix
*/
SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset);
/*!
* \brief load a data matrix into binary file
* \param handle a instance of data matrix
* \param fname file name
* \param silent print statistics when saving
*/
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
/*!
* \brief set information to dmatrix
* \param handle a instance of data matrix
* \param field field name, can be label, weight
* \param array pointer to float vector
*/
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
/*!
* \brief get info vector from matrix
* \param handle a instance of data matrix
* \param field field name
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief return number of rows
* \param handle a instance of data matrix
*/
SEXP XGDMatrixNumRow_R(SEXP handle);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
*/
SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
*/
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
* \param iter current iteration rounds
* \param dmats list of handles to dmatrices
* \param evname name of evaluation
* \return the string containing evaluation stati
*/
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
/*!
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param option_mask output_margin:1 predict_leaf:2
* \param ntree_limit limit number of trees used in prediction
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
/*!
* \brief load model from raw array
* \param handle handle
*/
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
/*!
* \brief save model into R's raw array
* \param handle handle
* \return raw array
*/
SEXP XGBoosterModelToRaw_R(SEXP handle);
/*!
* \brief dump model into a string
* \param handle handle
* \param fmap name to fmap can be empty string
* \param with_stats whether dump statistics of splits
*/
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
}
#endif // XGBOOST_WRAPPER_R_H_ // NOLINT(*)

View File

@@ -1,34 +0,0 @@
// Copyright (c) 2014 by Contributors
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>
// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("AssertError:%s\n", buf);
}
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("%s\n", buf);
}
}
int XGBoostSPrintf_R(char *buf, size_t size, const char *fmt, ...) {
int ret;
va_list args;
va_start(args, fmt);
ret = vsnprintf(buf, size, fmt, args);
va_end(args);
return ret;
}

View File

@@ -1,4 +0,0 @@
library(testthat)
library(xgboost)
test_check("xgboost")

View File

@@ -1,36 +0,0 @@
require(xgboost)
context("basic functions")
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
set.seed(1994)
test_that("train and predict", {
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
})
test_that("early stopping", {
res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
expect_true(nrow(res) < 20)
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
})
test_that("save_period", {
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
save_period = 10, save_name = "xgb.model")
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
})

View File

@@ -1,48 +0,0 @@
context('Test models with custom objective')
require(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
test_that("custom objective works", {
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(metric = "error", value = err))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
objective=logregobj, eval_metric=evalerror)
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
logregobjattr <- function(preds, dtrain) {
labels <- attr(dtrain, 'label')
preds <- 1 / (1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent = 1,
objective = logregobjattr, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)
})

View File

@@ -1,19 +0,0 @@
context('Test generalized linear models')
require(xgboost)
test_that("glm works", {
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
expect_equal(class(dtrain), "xgb.DMatrix")
expect_equal(class(dtest), "xgb.DMatrix")
param <- list(objective = "binary:logistic", booster = "gblinear",
nthread = 2, alpha = 0.0001, lambda = 1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
expect_equal(length(getinfo(dtest, 'label')), 1611)
})

View File

@@ -1,68 +0,0 @@
context('Test helper functions')
require(xgboost)
require(data.table)
require(Matrix)
require(vcd)
set.seed(1982)
data(Arthritis)
data(agaricus.train, package='xgboost')
df <- data.table(Arthritis, keep.rownames = F)
df[,AgeDiscret := as.factor(round(Age / 10,0))]
df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
df[,ID := NULL]
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gbtree")
bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")
feature.names <- agaricus.train$data@Dimnames[[2]]
test_that("xgb.dump works", {
capture.output(print(xgb.dump(bst.Tree)))
capture.output(print(xgb.dump(bst.GLM)))
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with.stats = T))
})
test_that("xgb.model.dt.tree works with and without feature names", {
names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover",
"Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
expect_equal(dim(dt.tree), c(162, 15))
xgb.model.dt.tree(model = bst.Tree)
})
test_that("xgb.importance works with and without feature names", {
importance.Tree <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.Tree)
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
xgb.importance(model = bst.Tree)
xgb.plot.importance(importance_matrix = importance.Tree)
})
test_that("xgb.importance works with GLM model", {
importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM)
expect_equal(dim(importance.GLM), c(10, 2))
expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
xgb.importance(model = bst.GLM)
xgb.plot.importance(importance.GLM)
})
test_that("xgb.plot.tree works with and without feature names", {
xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
xgb.plot.tree(model = bst.Tree)
})
test_that("xgb.plot.multi.trees works with and without feature names", {
xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features.keep = 3)
xgb.plot.multi.trees(model = bst.Tree, features.keep = 3)
})
test_that("xgb.plot.deepness works", {
xgb.plot.deepness(model = bst.Tree)
})

View File

@@ -1,27 +0,0 @@
context("Code is of high quality and lint free")
test_that("Code Lint", {
skip_on_cran()
skip_on_travis()
skip_if_not_installed("lintr")
my_linters <- list(
absolute_paths_linter=lintr::absolute_paths_linter,
assignment_linter=lintr::assignment_linter,
closed_curly_linter=lintr::closed_curly_linter,
commas_linter=lintr::commas_linter,
# commented_code_linter=lintr::commented_code_linter,
infix_spaces_linter=lintr::infix_spaces_linter,
line_length_linter=lintr::line_length_linter,
no_tab_linter=lintr::no_tab_linter,
object_usage_linter=lintr::object_usage_linter,
# snake_case_linter=lintr::snake_case_linter,
# multiple_dots_linter=lintr::multiple_dots_linter,
object_length_linter=lintr::object_length_linter,
open_curly_linter=lintr::open_curly_linter,
# single_quotes_linter=lintr::single_quotes_linter,
spaces_inside_linter=lintr::spaces_inside_linter,
spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter,
trailing_blank_lines_linter=lintr::trailing_blank_lines_linter,
trailing_whitespace_linter=lintr::trailing_whitespace_linter
)
# lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality
})

View File

@@ -1,32 +0,0 @@
context('Test model params and call are exposed to R')
require(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
bst <- xgboost(data = dtrain,
max.depth = 2,
eta = 1,
nround = 10,
nthread = 1,
verbose = 0,
objective = "binary:logistic")
test_that("call is exposed to R", {
model_call <- attr(bst, "call")
expect_is(model_call, "call")
})
test_that("params is exposed to R", {
model_params <- attr(bst, "params")
expect_is(model_params, "list")
expect_equal(model_params$eta, 1)
expect_equal(model_params$max.depth, 2)
expect_equal(model_params$objective, "binary:logistic")
})

View File

@@ -1,14 +0,0 @@
context('Test poisson regression model')
require(xgboost)
set.seed(1994)
test_that("poisson regression works", {
data(mtcars)
bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11],
objective = 'count:poisson', nrounds=5)
expect_equal(class(bst), "xgb.Booster")
pred <- predict(bst,as.matrix(mtcars[, -11]))
expect_equal(length(pred), 32)
expect_equal(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 1.16, tolerance = 0.01)
})

View File

@@ -1,337 +0,0 @@
---
title: "Understand your dataset with Xgboost"
output:
rmarkdown::html_vignette:
css: vignette.css
number_sections: yes
toc: yes
author: Tianqi Chen, Tong He, Michaël Benesty
vignette: >
%\VignetteIndexEntry{Discover your data}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
---
Introduction
============
The purpose of this Vignette is to show you how to use **Xgboost** to discover and understand your own dataset better.
This Vignette is not about predicting anything (see [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **Xgboost** to highlight the *link* between the *features* of your data and the *outcome*.
Pacakge loading:
```{r libLoading, results='hold', message=F, warning=F}
require(xgboost)
require(Matrix)
require(data.table)
if (!require('vcd')) install.packages('vcd')
```
> **VCD** package is used for one of its embedded dataset only.
Preparation of the dataset
==========================
Numeric VS categorical variables
--------------------------------
**Xgboost** manages only `numeric` vectors.
What to do when you have *categorical* data?
A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable.
> In **R**, a *categorical* variable is called `factor`.
>
> Type `?factor` in the console for more information.
To answer the question above we will convert *categorical* variables to `numeric` one.
Conversion from categorical to numeric variables
------------------------------------------------
### Looking at the raw data
In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
```{r, results='hide'}
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = F)
```
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `panda` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
The first thing we want to do is to have a look to the first lines of the `data.table`:
```{r}
head(df)
```
Now we will check the format of each column.
```{r}
str(df)
```
2 columns have `factor` type, one has `ordinal` type.
> `ordinal` variable :
>
> * can take a limited number of values (like `factor`) ;
> * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None`
### Creation of new features based on old ones
We will add some new *categorical* features to see if it helps.
#### Grouping per 10 years
For the first feature we create groups of age by rounding the real age.
Note that we transform it to `factor` so the algorithm treat these age groups as independent values.
Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
```{r}
head(df[,AgeDiscret := as.factor(round(Age/10,0))])
```
#### Random split in two groups
Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...).
```{r}
head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))])
```
#### Risks in adding correlated features
These new features are highly correlated to the `Age` feature because they are simple transformations of this feature.
For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated.
Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation.
#### Cleaning data
We remove ID as there is nothing to learn from this feature (it would just add some noise).
```{r, results='hide'}
df[,ID:=NULL]
```
We will list the different values for the column `Treatment`:
```{r}
levels(df[,Treatment])
```
### One-hot encoding
Next step, we will transform the categorical data to dummy variables.
This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
The purpose is to transform each value of each *categorical* feature in a *binary* feature `{0, 1}`.
For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
Column `Improved` is excluded because it will be our `label` column, the one we want to predict.
```{r, warning=FALSE,message=FALSE}
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
head(sparse_matrix)
```
> Formulae `Improved~.-1` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
Create the output `numeric` vector (not as a sparse `Matrix`):
```{r}
output_vector = df[,Improved] == "Marked"
```
1. set `Y` vector to `0`;
2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ;
3. return `Y` vector.
Build the model
===============
The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
```{r}
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
```
You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
> Here you can see the numbers decrease until line 7 and then increase.
>
> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
Feature importance
==================
Measure feature importance
--------------------------
### Build the feature importance data.table
In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
```{r}
importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
head(importance)
```
> The column `Gain` provide the information we are looking for.
>
> As you can see, features are classified by `Gain`.
`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there was some wrongly classified elements, after adding the split on this feature, there are two new branches, and each of these branch is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite).
`Cover` measures the relative quantity of observations concerned by a feature.
`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
### Improvement in the interpretability of feature importance data.table
We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we may want to answer would be: does receiving a placebo treatment helps to recover from the illness?
One simple solution is to count the co-occurrences of a feature and a class of the classification.
For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
```{r}
importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
# Cleaning for better display
importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
head(importanceClean)
```
> In the table above we have removed two not needed columns and select only the first lines.
First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
How the split is applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment.
The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic).
> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
Plotting the feature importance
-------------------------------
All these things are nice, but it would be even better to plot the results.
```{r, fig.width=8, fig.height=5, fig.align='center'}
xgb.plot.importance(importance_matrix = importanceRaw)
```
Feature have automatically been divided in 2 clusters: the interesting features... and the others.
> Depending of the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information.
According to the plot above, the most important features in this dataset to predict if the treatment will work are :
* the Age ;
* having received a placebo or not ;
* the sex is third but already included in the not interesting features group ;
* then we see our generated features (AgeDiscret). We can see that their contribution is very low.
Do these results make sense?
------------------------------
Let's check some **Chi2** between each of these features and the label.
Higher **Chi2** means better correlation.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$Age, output_vector)
print(c2)
```
Pearson correlation between Age and illness disapearing is **`r round(c2$statistic, 2 )`**.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$AgeDiscret, output_vector)
print(c2)
```
Our first simplification of Age gives a Pearson correlation is **`r round(c2$statistic, 2)`**.
```{r, warning=FALSE, message=FALSE}
c2 <- chisq.test(df$AgeCat, output_vector)
print(c2)
```
The perfectly random split I did between young and old at 30 years old have a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same.
Morality: don't let your *gut* lower the quality of your model.
In *data science* expression, there is the word *science* :-)
Conclusion
==========
As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that.
But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model.
The case studied here is not enough complex to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age.
Linear model may not be that smart in this scenario.
Special Note: What about Random Forests™?
==========================================
As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
This difference have an impact on a corner case in feature importance analysis: the *correlated features*.
Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests™).
However, in Random Forests™ this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximatively, depending of your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature have an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them.
If you want to try Random Forests™ algorithm, you can tweak Xgboost parameters!
**Warning**: this is still an experimental parameter.
For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns:
```{r, warning=FALSE, message=FALSE}
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
#Random Forest™ - 1000 trees
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
#Boosting - 3 rounds
bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
```
> Note that the parameter `round` is set to `1`.
> [**Random Forests™**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software.

View File

@@ -1,225 +0,0 @@
body {
margin: 0 auto;
background-color: white;
/* --------- FONT FAMILY --------
following are some optional font families. Usually a family
is safer to choose than a specific font,
which may not be on the users computer */
/ font-family:Georgia, Palatino, serif;
font-family: "Open Sans", "Book Antiqua", Palatino, serif;
/ font-family:Arial, Helvetica, sans-serif;
/ font-family:Tahoma, Verdana, Geneva, sans-serif;
/ font-family:Courier, monospace;
/ font-family:"Times New Roman", Times, serif;
/* -------------- COLOR OPTIONS ------------
following are additional color options for base font
you could uncomment another one to easily change the base color
or add one to a specific element style below */
color: #333333; /* dark gray not black */
/ color: #000000; /* black */
/ color: #666666; /* medium gray black */
/ color: #E3E3E3; /* very light gray */
/ color: white;
line-height: 100%;
max-width: 800px;
padding: 10px;
font-size: 17px;
text-align: justify;
text-justify: inter-word;
}
p {
line-height: 150%;
/ max-width: 540px;
max-width: 960px;
margin-bottom: 5px;
font-weight: 400;
/ color: #333333
}
h1, h2, h3, h4, h5, h6 {
font-weight: 400;
margin-top: 35px;
margin-bottom: 15px;
padding-top: 10px;
}
h1 {
margin-top: 70px;
color: #606AAA;
font-size:230%;
font-variant:small-caps;
padding-bottom:20px;
width:100%;
border-bottom:1px solid #606AAA;
}
h2 {
font-size:160%;
}
h3 {
font-size:130%;
}
h4 {
font-size:120%;
font-variant:small-caps;
}
h5 {
font-size:120%;
}
h6 {
font-size:120%;
font-variant:small-caps;
}
a {
color: #606AAA;
margin: 0;
padding: 0;
vertical-align: baseline;
}
a:hover {
text-decoration: blink;
color: green;
}
a:visited {
color: gray;
}
ul, ol {
padding: 0;
margin: 0px 0px 0px 50px;
}
ul {
list-style-type: square;
list-style-position: inside;
}
li {
line-height:150%
}
li ul, li ul {
margin-left: 24px;
}
pre {
padding: 0px 10px;
max-width: 800px;
white-space: pre-wrap;
}
code {
font-family: Consolas, Monaco, Andale Mono, monospace, courrier new;
line-height: 1.5;
font-size: 15px;
background: #F8F8F8;
border-radius: 4px;
padding: 5px;
display: inline-block;
max-width: 800px;
white-space: pre-wrap;
}
li code, p code {
background: #CDCDCD;
color: #606AAA;
padding: 0px 5px 0px 5px;
}
code.r, code.cpp {
display: block;
word-wrap: break-word;
border: 1px solid #606AAA;
}
aside {
display: block;
float: right;
width: 390px;
}
blockquote {
border-left:.5em solid #606AAA;
background: #F8F8F8;
padding: 0em 1em 0em 1em;
margin-left:10px;
max-width: 500px;
}
blockquote cite {
line-height:10px;
color:#bfbfbf;
}
blockquote cite:before {
/content: '\2014 \00A0';
}
blockquote p, blockquote li {
color: #666;
}
hr {
/ width: 540px;
text-align: left;
margin: 0 auto 0 0;
color: #999;
}
/* table */
table {
width: 100%;
border-top: 1px solid #919699;
border-left: 1px solid #919699;
border-spacing: 0;
}
table th {
padding: 4px 8px 4px 8px;
text-align: center;
color: white;
background: #606AAA;
border-bottom: 1px solid #919699;
border-right: 1px solid #919699;
}
table th p {
font-weight: bold;
margin-bottom: 0px;
}
table td {
padding: 8px;
vertical-align: top;
border-bottom: 1px solid #919699;
border-right: 1px solid #919699;
}
table td:last-child {
/background: lightgray;
text-align: right;
}
table td p {
margin-bottom: 0px;
}
table td p + p {
margin-top: 5px;
}
table td p + p + p {
margin-top: 5px;
}

View File

@@ -1,221 +0,0 @@
\documentclass{article}
\RequirePackage{url}
\usepackage{hyperref}
\RequirePackage{amsmath}
\RequirePackage{natbib}
\RequirePackage[a4paper,lmargin={1.25in},rmargin={1.25in},tmargin={1in},bmargin={1in}]{geometry}
\makeatletter
% \VignetteIndexEntry{xgboost: eXtreme Gradient Boosting}
%\VignetteKeywords{xgboost, gbm, gradient boosting machines}
%\VignettePackage{xgboost}
% \VignetteEngine{knitr::knitr}
\makeatother
\begin{document}
%\SweaveOpts{concordance=TRUE}
<<knitropts,echo=FALSE,message=FALSE>>=
if (require('knitr')) opts_chunk$set(fig.width = 5, fig.height = 5, fig.align = 'center', tidy = FALSE, warning = FALSE, cache = TRUE)
@
%
<<prelim,echo=FALSE>>=
xgboost.version = '0.3-0'
@
%
\begin{center}
\vspace*{6\baselineskip}
\rule{\textwidth}{1.6pt}\vspace*{-\baselineskip}\vspace*{2pt}
\rule{\textwidth}{0.4pt}\\[2\baselineskip]
{\LARGE \textbf{xgboost: eXtreme Gradient Boosting}}\\[1.2\baselineskip]
\rule{\textwidth}{0.4pt}\vspace*{-\baselineskip}\vspace{3.2pt}
\rule{\textwidth}{1.6pt}\\[2\baselineskip]
{\Large Tianqi Chen, Tong He}\\[\baselineskip]
{\large Package Version: \Sexpr{xgboost.version}}\\[\baselineskip]
{\large \today}\par
\vfill
\end{center}
\thispagestyle{empty}
\clearpage
\setcounter{page}{1}
\section{Introduction}
This is an introductory document of using the \verb@xgboost@ package in R.
\verb@xgboost@ is short for eXtreme Gradient Boosting package. It is an efficient
and scalable implementation of gradient boosting framework by \citep{friedman2001greedy} \citep{friedman2000additive}.
The package includes efficient linear model solver and tree learning algorithm.
It supports various objective functions, including regression, classification
and ranking. The package is made to be extendible, so that users are also allowed to define their own objectives easily. It has several features:
\begin{enumerate}
\item{Speed: }{\verb@xgboost@ can automatically do parallel computation on
Windows and Linux, with openmp. It is generally over 10 times faster than
\verb@gbm@.}
\item{Input Type: }{\verb@xgboost@ takes several types of input data:}
\begin{itemize}
\item{Dense Matrix: }{R's dense matrix, i.e. \verb@matrix@}
\item{Sparse Matrix: }{R's sparse matrix \verb@Matrix::dgCMatrix@}
\item{Data File: }{Local data files}
\item{xgb.DMatrix: }{\verb@xgboost@'s own class. Recommended.}
\end{itemize}
\item{Sparsity: }{\verb@xgboost@ accepts sparse input for both tree booster
and linear booster, and is optimized for sparse input.}
\item{Customization: }{\verb@xgboost@ supports customized objective function
and evaluation function}
\item{Performance: }{\verb@xgboost@ has better performance on several different
datasets.}
\end{enumerate}
\section{Example with Mushroom data}
In this section, we will illustrate some common usage of \verb@xgboost@. The
Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichman:2013}
<<Training and prediction with iris>>=
library(xgboost)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
nround = 2, objective = "binary:logistic")
xgb.save(bst, 'model.save')
bst = xgb.load('model.save')
pred <- predict(bst, test$data)
@
\verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
\verb@predict@ does prediction on the model.
Here we can save the model to a binary local file, and load it when needed.
We can't inspect the trees inside. However we have another function to save the
model in plain text.
<<Dump Model>>=
xgb.dump(bst, 'model.dump')
@
The output looks like
\begin{verbatim}
booster[0]:
0:[f28<1.00001] yes=1,no=2,missing=2
1:[f108<1.00001] yes=3,no=4,missing=4
3:leaf=1.85965
4:leaf=-1.94071
2:[f55<1.00001] yes=5,no=6,missing=6
5:leaf=-1.70044
6:leaf=1.71218
booster[1]:
0:[f59<1.00001] yes=1,no=2,missing=2
1:leaf=-6.23624
2:[f28<1.00001] yes=3,no=4,missing=4
3:leaf=-0.96853
4:leaf=0.784718
\end{verbatim}
It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
It speeds up \verb@xgboost@, and is needed for advanced features such as
training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
<<xgb.DMatrix>>=
dtrain <- xgb.DMatrix(train$data, label = train$label)
class(dtrain)
head(getinfo(dtrain,'label'))
@
We can also save the matrix to a binary file. Then load it simply with
\verb@xgb.DMatrix@
<<save model>>=
xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
dtrain = xgb.DMatrix('xgb.DMatrix')
@
\section{Advanced Examples}
The function \verb@xgboost@ is a simple function with less parameter, in order
to be R-friendly. The core training function is wrapped in \verb@xgb.train@. It is more flexible than \verb@xgboost@, but it requires users to read the document a bit more carefully.
\verb@xgb.train@ only accept a \verb@xgb.DMatrix@ object as its input, while it supports advanced features as custom objective and evaluation functions.
<<Customized loss function>>=
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- sqrt(mean((preds-labels)^2))
return(list(metric = "MSE", value = err))
}
dtest <- xgb.DMatrix(test$data, label = test$label)
watchlist <- list(eval = dtest, train = dtrain)
param <- list(max.depth = 2, eta = 1, silent = 1)
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
@
The gradient and second order gradient is required for the output of customized
objective function.
We also have \verb@slice@ for row extraction. It is useful in
cross-validation.
For a walkthrough demo, please see \verb@R-package/demo/@ for further
details.
\section{The Higgs Boson competition}
We have made a demo for \href{http://www.kaggle.com/c/higgs-boson}{the Higgs
Boson Machine Learning Challenge}.
Here are the instructions to make a submission
\begin{enumerate}
\item Download the \href{http://www.kaggle.com/c/higgs-boson/data}{datasets}
and extract them to \verb@data/@.
\item Run scripts under \verb@xgboost/demo/kaggle-higgs/@:
\href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R}{higgs-train.R}
and \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-pred.R}{higgs-pred.R}.
The computation will take less than a minute on Intel i7.
\item Go to the \href{http://www.kaggle.com/c/higgs-boson/submissions/attach}{submission page}
and submit your result.
\end{enumerate}
We provide \href{https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R}{a script}
to compare the time cost on the higgs dataset with \verb@gbm@ and \verb@xgboost@.
The training set contains 350000 records and 30 features.
\verb@xgboost@ can automatically do parallel computation. On a machine with Intel
i7-4700MQ and 24GB memories, we found that \verb@xgboost@ costs about 35 seconds, which is about 20 times faster
than \verb@gbm@. When we limited \verb@xgboost@ to use only one thread, it was
still about two times faster than \verb@gbm@.
Meanwhile, the result from \verb@xgboost@ reaches
\href{http://www.kaggle.com/c/higgs-boson/details/evaluation}{3.60@AMS} with a
single model. This results stands in the
\href{http://www.kaggle.com/c/higgs-boson/leaderboard}{top 30\%} of the
competition.
\bibliographystyle{jss}
\nocite{*} % list uncited references
\bibliography{xgboost}
\end{document}
<<Temp file cleaning, include=FALSE>>=
file.remove("xgb.DMatrix")
file.remove("model.dump")
file.remove("model.save")
@

View File

@@ -1,30 +0,0 @@
@article{friedman2001greedy,
title={Greedy function approximation: a gradient boosting machine},
author={Friedman, Jerome H},
journal={Annals of Statistics},
pages={1189--1232},
year={2001},
publisher={JSTOR}
}
@article{friedman2000additive,
title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)},
author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others},
journal={The annals of statistics},
volume={28},
number={2},
pages={337--407},
year={2000},
publisher={Institute of Mathematical Statistics}
}
@misc{
Bache+Lichman:2013 ,
author = "K. Bache and M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
institution = "University of California, Irvine, School of Information and Computer Sciences"
}

View File

@@ -1,420 +0,0 @@
---
title: "Xgboost presentation"
output:
rmarkdown::html_vignette:
css: vignette.css
number_sections: yes
toc: yes
bibliography: xgboost.bib
author: Tianqi Chen, Tong He, Michaël Benesty
vignette: >
%\VignetteIndexEntry{Xgboost presentation}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
---
Introduction
============
**Xgboost** is short for e**X**treme **G**radient **Boost**ing package.
The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.
It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included:
- *linear* model ;
- *tree learning* algorithm.
It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily.
It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.
It has several features:
* Speed: it can automatically do parallel computation on *Windows* and *Linux*, with *OpenMP*. It is generally over 10 times faster than the classical `gbm`.
* Input Type: it takes several types of input data:
* *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ;
* *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ;
* Data File: local data files ;
* `xgb.DMatrix`: its own class (recommended).
* Sparsity: it accepts *sparse* input for both *tree booster* and *linear booster*, and is optimized for *sparse* input ;
* Customization: it supports customized objective functions and evaluation functions.
Installation
============
Github version
--------------
For up-to-date version (highly recommended), install from *Github*:
```{r installGithub, eval=FALSE}
devtools::install_github('dmlc/xgboost', subdir='R-package')
```
> *Windows* user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
Cran version
------------
As of 2015-03-13, xgboost was removed from the CRAN repository.
Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
Learning
========
For the purpose of this tutorial we will load **XGBoost** package.
```{r libLoading, results='hold', message=F, warning=F}
require(xgboost)
```
Dataset presentation
--------------------
In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-).
Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
Dataset loading
---------------
We will load the `agaricus` datasets embedded with the package and will link them to variables.
The datasets are already split in:
* `train`: will be used to build the model ;
* `test`: will be used to assess the quality of our model.
Why *split* the dataset in two parts?
In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen.
```{r datasetLoading, results='hold', message=F, warning=F}
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
```
> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/splitting.html).
Each variable is a `list` containing two things, `label` and `data`:
```{r dataList, message=F, warning=F}
str(train)
```
`label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict.
Let's discover the dimensionality of our datasets.
```{r dataSize, message=F, warning=F}
dim(train$data)
dim(test$data)
```
This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently.
As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
```{r dataClass, message=F, warning=F}
class(train$data)[1]
class(train$label)
```
Basic Training using XGBoost
----------------------------
This step is the most critical part of the process for the quality of our model.
### Basic training
We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`.
In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. It is very usual to have such dataset.
We will train decision tree model using the following parameters:
* `objective = "binary:logistic"`: we will train a binary classification model ;
* `max.deph = 2`: the trees won't be deep, because our case is very simple ;
* `nthread = 2`: the number of cpu threads we are going to use;
* `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
```{r trainingSparse, message=F, warning=F}
bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
> More complex the relationship between your features and your `label` is, more passes you need.
### Parameter variations
#### Dense matrix
Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
```{r trainingDense, message=F, warning=F}
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
#### xgb.DMatrix
**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
```{r trainingDmatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
```
#### Verbose option
**XGBoost** has several features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics).
```{r trainingVerbose0, message=T, warning=F}
# verbose = 0, no message
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
```
```{r trainingVerbose1, message=T, warning=F}
# verbose = 1, print evaluation metric
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
```
```{r trainingVerbose2, message=T, warning=F}
# verbose = 2, also print information about tree
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
```
Basic prediction using XGBoost
==============================
Perform the prediction
----------------------
The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step.
```{r predicting, message=F, warning=F}
pred <- predict(bst, test$data)
# size of the prediction vector
print(length(pred))
# limit display of predictions to the first 10
print(head(pred))
```
These numbers doesn't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results.
Transform the regression in a binary classification
---------------------------------------------------
The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.
How can we use a *regression* model to perform a binary classification?
If we think about the meaning of a regression applied to our data, the numbers we get are probabilities that a datum will be classified as `1`. Therefore, we will set the rule that if this probability for a specific datum is `> 0.5` then the observation is classified as `1` (or `0` otherwise).
```{r predictingTest, message=F, warning=F}
prediction <- as.numeric(pred > 0.5)
print(head(prediction))
```
Measuring model performance
---------------------------
To measure the model performance, we will compute a simple metric, the *average error*.
```{r predictingAverageError, message=F, warning=F}
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
```
> Note that the algorithm has not seen the `test` data during the model construction.
Steps explanation:
1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ;
2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
3. `mean(vectorOfErrors)` computes the *average error* itself.
The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.
*Multiclass* classification works in a similar way.
This metric is **`r round(err, 2)`** and is pretty low: our yummly mushroom model works well!
Advanced features
=================
Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content.
Dataset preparation
-------------------
For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
```{r DMatrix, message=F, warning=F}
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
```
Measure learning progress with xgb.train
----------------------------------------
Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following techniques will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible.
One way to measure progress in learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
> in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors.
For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name.
```{r watchlist, message=F, warning=F}
watchlist <- list(train=dtrain, test=dtest)
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
```
**XGBoost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
```{r watchlist2, message=F, warning=F}
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
```
> `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
Linear boosting
---------------
Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
```{r linearBoosting, message=F, warning=F}
bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
```
In this specific case, *linear boosting* gets sligtly better performance metrics than decision trees based algorithm.
In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
Manipulating xgb.DMatrix
------------------------
### Save / Load
Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function.
```{r DMatrixSave, message=F, warning=F}
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
```
```{r DMatrixDel, include=FALSE}
file.remove("dtrain.buffer")
```
### Information extraction
Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data.
```{r getinfo, message=F, warning=F}
label = getinfo(dtest, "label")
pred <- predict(bst, dtest)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
```
View feature importance/influence from the learnt model
-------------------------------------------------------
Feature importance is similar to R gbm package's relative influence (rel.inf).
```
importance_matrix <- xgb.importance(model = bst)
print(importance_matrix)
xgb.plot.importance(importance_matrix = importance_matrix)
```
View the trees from a model
---------------------------
You can dump the tree you learned using `xgb.dump` into a text file.
```{r dump, message=T, warning=F}
xgb.dump(bst, with.stats = T)
```
You can plot the trees from your model using ```xgb.plot.tree``
```
xgb.plot.tree(model = bst)
```
> if you provide a path to `fname` parameter you can save the trees to your hard drive.
Save and load models
--------------------
Maybe your dataset is big, and it takes time to train a model on it? May be you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
Hopefully for you, **XGBoost** implements such functions.
```{r saveModel, message=F, warning=F}
# save model to binary local file
xgb.save(bst, "xgboost.model")
```
> `xgb.save` function should return `r TRUE` if everything goes well and crashes otherwise.
An interesting test to see how identical our saved model is to the original one would be to compare the two predictions.
```{r loadModel, message=F, warning=F}
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data)
# And now the test
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
```
```{r clean, include=FALSE}
# delete the created model
file.remove("./xgboost.model")
```
> result is `0`? We are good!
In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
```{r saveLoadRBinVectorModel, message=F, warning=F}
# save model to R's raw vector
rawVec <- xgb.save.raw(bst)
# print class
print(class(rawVec))
# load binary model to R
bst3 <- xgb.load(rawVec)
pred3 <- predict(bst3, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
```
> Again `0`? It seems that `XGBoost` works pretty well!
References
==========

117
README.md
View File

@@ -1,84 +1,47 @@
<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/xgboost.png width=135/> eXtreme Gradient Boosting
===========
[![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost)
[![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org)
[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
[![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/)
[![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
xgboost: eXtreme Gradient Boosting
=======
An optimized general purpose gradient boosting (tree) library.
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data
XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) <img src=https://avatars2.githubusercontent.com/u/11508361?v=3&s=20> projects
Contents
--------
* [What's New](#whats-new)
* [Version](#version)
* [Documentation](doc/index.md)
* [Build Instruction](doc/build.md)
* [Features](#features)
* [Distributed XGBoost](multi-node)
* [Usecases](doc/index.md#highlight-links)
* [Bug Reporting](#bug-reporting)
* [Contributing to XGBoost](#contributing-to-xgboost)
* [Committers and Contributors](CONTRIBUTORS.md)
* [License](#license)
* [XGBoost in Graphlab Create](#xgboost-in-graphlab-create)
What's New
----------
* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/).
* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/).
* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/).
* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
* XGBoost helps three champion teams to win [WWW2015 Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
Check out the [winning solution](doc/README.md#highlight-links)
* [External Memory Version](doc/external_memory.md)
Version
-------
* Current version xgboost-0.4
- [Change log](CHANGES.md)
- This version is compatible with 0.3x versions
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
Turorial and Documentation: https://github.com/tqchen/xgboost/wiki
Features
--------
* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py),
[R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R),
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
* Its fast! Benchmark numbers comparing xgboost, H20, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml)
* Memory efficient - Handles sparse matrices, supports external memory
* Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
* Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples.
=======
* Sparse feature format:
- Sparse feature format allows easy handling of missing values, and improve computation efficiency.
* Push the limit on single machine:
- Efficient implementation that optimizes memory and computation.
* Speed: XGBoost is very fast
- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
* Layout of gradient boosting algorithm to support user defined objective
* Python interface, works with numpy and scipy.sparse matrix
Bug Reporting
-------------
Supported key components
=======
* Gradient boosting models:
- regression tree (GBRT)
- linear model/lasso
* Objectives to support tasks:
- regression
- classification
* OpenMP implementation
* For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
* For generic questions or to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
Planned components
=======
* More objective to support tasks:
- ranking
- matrix factorization
- structured prediction
Build
======
* Simply type make
* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
- You may get a error: -lgomp is not found, you can remove -fopenmp flag in Makefile to get single thread xgboost, or upgrade your compiler to compile multi-thread version
Contributing to XGBoost
-----------------------
XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
License
-------
© Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
XGBoost in Graphlab Create
--------------------------
* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html)
* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge:
File extension convention
=======
* .h are interface, utils and data structures, with detailed comment;
* .cpp are implementations that will be compiled, with less comment;
* .hpp are implementations that will be included by .cpp, with less comment

View File

@@ -1,36 +0,0 @@
environment:
global:
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd"
DISABLE_OPENMP: 1
VisualStudioVersion: 12.0
matrix:
- PYTHON: "C:\\Python27-x64"
PYTHON_VERSION: "2.7.x" # currently 2.7.9
PYTHON_ARCH: "64"
- PYTHON: "C:\\Python33-x64"
PYTHON_VERSION: "3.3.x" # currently 3.3.5
PYTHON_ARCH: "64"
platform:
- x64
configuration:
- Release
install:
- cmd: git clone https://github.com/ogrisel/python-appveyor-demo
- ECHO "Filesystem root:"
- ps: "ls \"C:/\""
- ECHO "Installed SDKs:"
- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
- ps: python-appveyor-demo\appveyor\install.ps1
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
- "python --version"
- "python -c \"import struct; print(struct.calcsize('P') * 8)\""
build: off
#project: windows\xgboost.sln

View File

@@ -0,0 +1,200 @@
#ifndef XGBOOST_LINEAR_HPP
#define XGBOOST_LINEAR_HPP
/*!
* \file xgboost_linear.h
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is coordinate descent, require column major format
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include <algorithm>
#include "../xgboost.h"
#include "../../utils/xgboost_utils.h"
namespace xgboost{
namespace booster{
/*! \brief linear model, with L1/L2 regularization */
template<typename FMatrix>
class LinearBooster : public InterfaceBooster<FMatrix>{
public:
LinearBooster( void ){ silent = 0;}
virtual ~LinearBooster( void ){}
public:
virtual void SetParam( const char *name, const char *val ){
if( !strcmp( name, "silent") ) silent = atoi( val );
if( model.weight.size() == 0 ) model.param.SetParam( name, val );
param.SetParam( name, val );
}
virtual void LoadModel( utils::IStream &fi ){
model.LoadModel( fi );
}
virtual void SaveModel( utils::IStream &fo ) const{
model.SaveModel( fo );
}
virtual void InitModel( void ){
model.InitModel();
}
public:
virtual void DoBoost( std::vector<float> &grad,
std::vector<float> &hess,
const FMatrix &fmat,
const std::vector<unsigned> &root_index ){
utils::Assert( grad.size() < UINT_MAX, "number of instance exceed what we can handle" );
this->UpdateWeights( grad, hess, fmat );
}
inline float Predict( const FMatrix &fmat, bst_uint ridx, unsigned root_index ){
float sum = model.bias();
for( typename FMatrix::RowIter it = fmat.GetRow(ridx); it.Next(); ){
sum += model.weight[ it.findex() ] * it.fvalue();
}
return sum;
}
virtual float Predict( const std::vector<float> &feat,
const std::vector<bool> &funknown,
unsigned rid = 0 ){
float sum = model.bias();
for( size_t i = 0; i < feat.size(); i ++ ){
if( funknown[i] ) continue;
sum += model.weight[ i ] * feat[ i ];
}
return sum;
}
protected:
// training parameter
struct ParamTrain{
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
/*! \brief regularization weight for L2 norm in bias */
float reg_lambda_bias;
ParamTrain( void ){
reg_alpha = 0.0f; reg_lambda = 0.0f; reg_lambda_bias = 0.0f;
learning_rate = 1.0f;
}
inline void SetParam( const char *name, const char *val ){
// sync-names
if( !strcmp( "eta", name ) ) learning_rate = (float)atof( val );
if( !strcmp( "lambda", name ) ) reg_lambda = (float)atof( val );
if( !strcmp( "alpha", name ) ) reg_alpha = (float)atof( val );
if( !strcmp( "lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
// real names
if( !strcmp( "learning_rate", name ) ) learning_rate = (float)atof( val );
if( !strcmp( "reg_lambda", name ) ) reg_lambda = (float)atof( val );
if( !strcmp( "reg_alpha", name ) ) reg_alpha = (float)atof( val );
if( !strcmp( "reg_lambda_bias", name ) ) reg_lambda_bias = (float)atof( val );
}
// given original weight calculate delta
inline double CalcDelta( double sum_grad, double sum_hess, double w ){
if( sum_hess < 1e-5f ) return 0.0f;
double tmp = w - ( sum_grad + reg_lambda*w )/( sum_hess + reg_lambda );
if ( tmp >=0 ){
return std::max(-( sum_grad + reg_lambda*w + reg_alpha)/(sum_hess+reg_lambda),-w);
}else{
return std::min(-( sum_grad + reg_lambda*w - reg_alpha)/(sum_hess+reg_lambda),-w);
}
}
// given original weight calculate delta bias
inline double CalcDeltaBias( double sum_grad, double sum_hess, double w ){
return - (sum_grad + reg_lambda_bias*w) / (sum_hess + reg_lambda_bias );
}
};
// model for linear booster
class Model{
public:
// model parameter
struct Param{
// number of feature dimension
int num_feature;
// reserved field
int reserved[ 32 ];
// constructor
Param( void ){
num_feature = 0;
memset( reserved, 0, sizeof(reserved) );
}
inline void SetParam( const char *name, const char *val ){
if( !strcmp( name, "num_feature" ) ) num_feature = atoi( val );
}
};
public:
Param param;
// weight for each of feature, bias is the last one
std::vector<float> weight;
public:
// initialize the model parameter
inline void InitModel( void ){
// bias is the last weight
weight.resize( param.num_feature + 1 );
std::fill( weight.begin(), weight.end(), 0.0f );
}
// save the model to file
inline void SaveModel( utils::IStream &fo ) const{
fo.Write( &param, sizeof(Param) );
fo.Write( &weight[0], sizeof(float) * weight.size() );
}
// load model from file
inline void LoadModel( utils::IStream &fi ){
utils::Assert( fi.Read( &param, sizeof(Param) ) != 0, "Load LinearBooster" );
weight.resize( param.num_feature + 1 );
utils::Assert( fi.Read( &weight[0], sizeof(float) * weight.size() ) != 0, "Load LinearBooster" );
}
// model bias
inline float &bias( void ){
return weight.back();
}
};
private:
int silent;
protected:
Model model;
ParamTrain param;
protected:
// update weights, should work for any FMatrix
inline void UpdateWeights( std::vector<float> &grad,
const std::vector<float> &hess,
const FMatrix &smat ){
{// optimize bias
double sum_grad = 0.0, sum_hess = 0.0;
for( size_t i = 0; i < grad.size(); i ++ ){
sum_grad += grad[ i ]; sum_hess += hess[ i ];
}
// remove bias effect
double dw = param.learning_rate * param.CalcDeltaBias( sum_grad, sum_hess, model.bias() );
model.bias() += dw;
// update grad value
for( size_t i = 0; i < grad.size(); i ++ ){
grad[ i ] += dw * hess[ i ];
}
}
// optimize weight
const unsigned nfeat= (unsigned)smat.NumCol();
for( unsigned i = 0; i < nfeat; i ++ ){
if( !smat.GetSortedCol( i ).Next() ) continue;
double sum_grad = 0.0, sum_hess = 0.0;
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
const float v = it.fvalue();
sum_grad += grad[ it.rindex() ] * v;
sum_hess += hess[ it.rindex() ] * v * v;
}
float w = model.weight[ i ];
double dw = param.learning_rate * param.CalcDelta( sum_grad, sum_hess, w );
model.weight[ i ] += dw;
// update grad value
for( typename FMatrix::ColIter it = smat.GetSortedCol(i); it.Next(); ){
const float v = it.fvalue();
grad[ it.rindex() ] += hess[ it.rindex() ] * v * dw;
}
}
}
};
};
};
#endif

View File

@@ -0,0 +1,147 @@
#ifndef XGBOOST_BASE_TREEMAKER_HPP
#define XGBOOST_BASE_TREEMAKER_HPP
/*!
* \file xgboost_base_treemaker.hpp
* \brief implementation of base data structure for regression tree maker,
* gives common operations of tree construction steps template
*
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
#include <vector>
#include "xgboost_tree_model.h"
namespace xgboost{
namespace booster{
class BaseTreeMaker{
protected:
BaseTreeMaker( RegTree &tree,
const TreeParamTrain &param )
: tree( tree ), param( param ){}
protected:
// statistics that is helpful to decide a split
struct SplitEntry{
/*! \brief loss change after split this node */
float loss_chg;
/*! \brief split index */
unsigned sindex;
/*! \brief split value */
float split_value;
/*! \brief constructor */
SplitEntry( void ){
loss_chg = 0.0f;
split_value = 0.0f; sindex = 0;
}
// This function gives better priority to lower index when loss_chg equals
// not the best way, but helps to give consistent result during multi-thread execution
inline bool NeedReplace( float loss_chg, unsigned split_index ) const{
if( this->split_index() <= split_index ){
return loss_chg > this->loss_chg;
}else{
return !(this->loss_chg > loss_chg);
}
}
inline bool Update( const SplitEntry &e ){
if( this->NeedReplace( e.loss_chg, e.split_index() ) ){
this->loss_chg = e.loss_chg;
this->sindex = e.sindex;
this->split_value = e.split_value;
return true;
} else{
return false;
}
}
inline bool Update( float loss_chg, unsigned split_index, float split_value, bool default_left ){
if( this->NeedReplace( loss_chg, split_index ) ){
this->loss_chg = loss_chg;
if( default_left ) split_index |= (1U << 31);
this->sindex = split_index;
this->split_value = split_value;
return true;
}else{
return false;
}
}
inline unsigned split_index( void ) const{
return sindex & ( (1U<<31) - 1U );
}
inline bool default_left( void ) const{
return (sindex >> 31) != 0;
}
};
struct NodeEntry{
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief loss of this node, without split */
float root_gain;
/*! \brief weight calculated related to current data */
float weight;
/*! \brief current best solution */
SplitEntry best;
NodeEntry( void ){
sum_grad = sum_hess = 0.0;
weight = root_gain = 0.0f;
}
};
private:
// try to prune off current leaf, return true if successful
inline void TryPruneLeaf( int nid, int depth ){
if( tree[ nid ].is_root() ) return;
int pid = tree[ nid ].parent();
RegTree::NodeStat &s = tree.stat( pid );
++ s.leaf_child_cnt;
if( s.leaf_child_cnt >= 2 && param.need_prune( s.loss_chg, depth - 1 ) ){
this->stat_num_pruned += 2;
// need to be pruned
tree.ChangeToLeaf( pid, param.learning_rate * s.base_weight );
// tail recursion
this->TryPruneLeaf( pid, depth - 1 );
}
}
protected:
/*! \brief do prunning of a tree */
inline int DoPrune( void ){
this->stat_num_pruned = 0;
// initialize auxiliary statistics
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
tree.stat( nid ).leaf_child_cnt = 0;
tree.stat( nid ).loss_chg = snode[ nid ].best.loss_chg;
tree.stat( nid ).sum_hess = static_cast<float>( snode[ nid ].sum_hess );
}
for( int nid = 0; nid < tree.param.num_nodes; ++ nid ){
if( tree[ nid ].is_leaf() ) this->TryPruneLeaf( nid, tree.GetDepth(nid) );
}
return this->stat_num_pruned;
}
protected:
/*! \brief update queue expand add in new leaves */
inline void UpdateQueueExpand( std::vector<int> &qexpand ){
std::vector<int> newnodes;
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
if( !tree[ nid ].is_leaf() ){
newnodes.push_back( tree[nid].cleft() );
newnodes.push_back( tree[nid].cright() );
}
}
// use new nodes for qexpand
qexpand = newnodes;
}
protected:
// local helper tmp data structure
// statistics
int stat_num_pruned;
/*! \brief queue of nodes to be expanded */
std::vector<int> qexpand;
/*! \brief TreeNode Data: statistics for each constructed node, the derived class must maintain this */
std::vector<NodeEntry> snode;
protected:
// original data that supports tree construction
RegTree &tree;
const TreeParamTrain &param;
};
}; // namespace booster
}; // namespace xgboost
#endif // XGBOOST_BASE_TREEMAKER_HPP

View File

@@ -0,0 +1,335 @@
#ifndef XGBOOST_COL_TREEMAKER_HPP
#define XGBOOST_COL_TREEMAKER_HPP
/*!
* \file xgboost_col_treemaker.hpp
* \brief implementation of regression tree maker,
* use a column based approach, with OpenMP
* \author Tianqi Chen: tianqi.tchen@gmail.com
*/
// use openmp
#include <vector>
#include "xgboost_tree_model.h"
#include "../../utils/xgboost_omp.h"
#include "../../utils/xgboost_random.h"
#include "../../utils/xgboost_fmap.h"
#include "xgboost_base_treemaker.hpp"
namespace xgboost{
namespace booster{
template<typename FMatrix>
class ColTreeMaker : protected BaseTreeMaker{
public:
ColTreeMaker( RegTree &tree,
const TreeParamTrain &param,
const std::vector<float> &grad,
const std::vector<float> &hess,
const FMatrix &smat,
const std::vector<unsigned> &root_index,
const utils::FeatConstrain &constrain )
: BaseTreeMaker( tree, param ),
grad(grad), hess(hess),
smat(smat), root_index(root_index), constrain(constrain) {
utils::Assert( grad.size() == hess.size(), "booster:invalid input" );
utils::Assert( smat.NumRow() == hess.size(), "booster:invalid input" );
utils::Assert( root_index.size() == 0 || root_index.size() == hess.size(), "booster:invalid input" );
utils::Assert( smat.HaveColAccess(), "ColTreeMaker: need column access matrix" );
}
inline void Make( int& stat_max_depth, int& stat_num_pruned ){
this->InitData();
this->InitNewNode( this->qexpand );
stat_max_depth = 0;
for( int depth = 0; depth < param.max_depth; ++ depth ){
this->FindSplit( depth );
this->UpdateQueueExpand( this->qexpand );
this->InitNewNode( this->qexpand );
// if nothing left to be expand, break
if( qexpand.size() == 0 ) break;
stat_max_depth = depth + 1;
}
// set all the rest expanding nodes to leaf
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
tree[ nid ].set_leaf( snode[nid].weight * param.learning_rate );
}
// start prunning the tree
stat_num_pruned = this->DoPrune();
}
private:
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry{
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief last feature value scanned */
float last_fvalue;
/*! \brief current best solution */
SplitEntry best;
/*! \brief constructor */
ThreadEntry( void ){
this->ClearStats();
}
/*! \brief clear statistics */
inline void ClearStats( void ){
sum_grad = sum_hess = 0.0;
}
};
private:
// make leaf nodes for all qexpand, update node statistics, mark leaf value
inline void InitNewNode( const std::vector<int> &qexpand ){
{// setup statistics space for each tree node
for( size_t i = 0; i < stemp.size(); ++ i ){
stemp[i].resize( tree.param.num_nodes, ThreadEntry() );
}
snode.resize( tree.param.num_nodes, NodeEntry() );
}
const unsigned ndata = static_cast<unsigned>( position.size() );
#pragma omp parallel for schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const int tid = omp_get_thread_num();
if( position[i] < 0 ) continue;
stemp[tid][ position[i] ].sum_grad += grad[i];
stemp[tid][ position[i] ].sum_hess += hess[i];
}
for( size_t j = 0; j < qexpand.size(); ++ j ){
const int nid = qexpand[ j ];
double sum_grad = 0.0, sum_hess = 0.0;
for( size_t tid = 0; tid < stemp.size(); tid ++ ){
sum_grad += stemp[tid][nid].sum_grad;
sum_hess += stemp[tid][nid].sum_hess;
}
// update node statistics
snode[nid].sum_grad = sum_grad;
snode[nid].sum_hess = sum_hess;
snode[nid].root_gain = param.CalcRootGain( sum_grad, sum_hess );
if( !tree[nid].is_root() ){
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, tree.stat( tree[nid].parent() ).base_weight );
tree.stat(nid).base_weight = snode[nid].weight;
}else{
snode[nid].weight = param.CalcWeight( sum_grad, sum_hess, 0.0f );
tree.stat(nid).base_weight = snode[nid].weight;
}
}
}
private:
// enumerate the split values of specific feature
template<typename Iter>
inline void EnumerateSplit( Iter it, const unsigned fid, std::vector<ThreadEntry> &temp, bool is_forward_search ){
// clear all the temp statistics
for( size_t j = 0; j < qexpand.size(); ++ j ){
temp[ qexpand[j] ].ClearStats();
}
while( it.Next() ){
const bst_uint ridx = it.rindex();
const int nid = position[ ridx ];
if( nid < 0 ) continue;
const float fvalue = it.fvalue();
ThreadEntry &e = temp[ nid ];
// test if first hit, this is fine, because we set 0 during init
if( e.sum_hess == 0.0 ){
e.sum_grad = grad[ ridx ];
e.sum_hess = hess[ ridx ];
e.last_fvalue = fvalue;
}else{
// try to find a split
if( fabsf(fvalue - e.last_fvalue) > rt_2eps && e.sum_hess >= param.min_child_weight ){
const double csum_hess = snode[ nid ].sum_hess - e.sum_hess;
if( csum_hess >= param.min_child_weight ){
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
const double loss_chg =
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+ param.CalcGain( csum_grad , csum_hess , snode[nid].weight )
- snode[nid].root_gain;
e.best.Update( loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search );
}
}
// update the statistics
e.sum_grad += grad[ ridx ];
e.sum_hess += hess[ ridx ];
e.last_fvalue = fvalue;
}
}
// finish updating all statistics, check if it is possible to include all sum statistics
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[ i ];
ThreadEntry &e = temp[ nid ];
const double csum_hess = snode[nid].sum_hess - e.sum_hess;
if( e.sum_hess >= param.min_child_weight && csum_hess >= param.min_child_weight ){
const double csum_grad = snode[nid].sum_grad - e.sum_grad;
const double loss_chg =
+ param.CalcGain( e.sum_grad, e.sum_hess, snode[nid].weight )
+ param.CalcGain( csum_grad, csum_hess, snode[nid].weight )
- snode[nid].root_gain;
const float delta = is_forward_search ? rt_eps:-rt_eps;
e.best.Update( loss_chg, fid, e.last_fvalue + delta, !is_forward_search );
}
}
}
// find splits at current level
inline void FindSplit( int depth ){
const unsigned nsize = static_cast<unsigned>( feat_index.size() );
#pragma omp parallel for schedule( dynamic, 1 )
for( unsigned i = 0; i < nsize; ++ i ){
const unsigned fid = feat_index[i];
const int tid = omp_get_thread_num();
if( param.need_forward_search() ){
this->EnumerateSplit( smat.GetSortedCol(fid), fid, stemp[tid], true );
}
if( param.need_backward_search() ){
this->EnumerateSplit( smat.GetReverseSortedCol(fid), fid, stemp[tid], false );
}
}
// after this each thread's stemp will get the best candidates, aggregate results
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[ i ];
NodeEntry &e = snode[ nid ];
for( int tid = 0; tid < this->nthread; ++ tid ){
e.best.Update( stemp[ tid ][ nid ].best );
}
// now we know the solution in snode[ nid ], set split
if( e.best.loss_chg > rt_eps ){
tree.AddChilds( nid );
tree[ nid ].set_split( e.best.split_index(), e.best.split_value, e.best.default_left() );
} else{
tree[ nid ].set_leaf( e.weight * param.learning_rate );
}
}
{// reset position
// step 1, set default direct nodes to default, and leaf nodes to -1,
const unsigned ndata = static_cast<unsigned>( position.size() );
#pragma omp parallel for schedule( static )
for( unsigned i = 0; i < ndata; ++ i ){
const int nid = position[i];
if( nid >= 0 ){
if( tree[ nid ].is_leaf() ){
position[i] = -1;
}else{
// push to default branch, correct latter
position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
}
}
}
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for( size_t i = 0; i < qexpand.size(); ++ i ){
const int nid = qexpand[i];
if( !tree[nid].is_leaf() ) fsplits.push_back( tree[nid].split_index() );
}
std::sort( fsplits.begin(), fsplits.end() );
fsplits.resize( std::unique( fsplits.begin(), fsplits.end() ) - fsplits.begin() );
const unsigned nfeats = static_cast<unsigned>( fsplits.size() );
#pragma omp parallel for schedule( dynamic, 1 )
for( unsigned i = 0; i < nfeats; ++ i ){
const unsigned fid = fsplits[i];
for( typename FMatrix::ColIter it = smat.GetSortedCol( fid ); it.Next(); ){
const bst_uint ridx = it.rindex();
int nid = position[ ridx ];
if( nid == -1 ) continue;
// go back to parent, correct those who are not default
nid = tree[ nid ].parent();
if( tree[ nid ].split_index() == fid ){
if( it.fvalue() < tree[nid].split_cond() ){
position[ ridx ] = tree[ nid ].cleft();
}else{
position[ ridx ] = tree[ nid ].cright();
}
}
}
}
}
}
private:
// initialize temp data structure
inline void InitData( void ){
{
position.resize( grad.size() );
if( root_index.size() == 0 ){
std::fill( position.begin(), position.end(), 0 );
}else{
for( size_t i = 0; i < root_index.size(); ++ i ){
position[i] = root_index[i];
utils::Assert( root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting" );
}
}
// mark delete for the deleted datas
for( size_t i = 0; i < grad.size(); ++ i ){
if( hess[i] < 0.0f ) position[i] = -1;
}
if( param.subsample < 1.0f - 1e-6f ){
for( size_t i = 0; i < grad.size(); ++ i ){
if( hess[i] < 0.0f ) continue;
if( random::SampleBinary( param.subsample) == 0 ){
position[ i ] = -1;
}
}
}
}
{// initialize feature index
int ncol = static_cast<int>( smat.NumCol() );
for( int i = 0; i < ncol; i ++ ){
if( smat.GetSortedCol(i).Next() && constrain.NotBanned(i) ){
feat_index.push_back( i );
}
}
random::Shuffle( feat_index );
}
{// setup temp space for each thread
if( param.nthread != 0 ){
omp_set_num_threads( param.nthread );
}
#pragma omp parallel
{
this->nthread = omp_get_num_threads();
}
// reserve a small space
stemp.resize( this->nthread, std::vector<ThreadEntry>() );
for( size_t i = 0; i < stemp.size(); ++ i ){
stemp[i].reserve( 256 );
}
snode.reserve( 256 );
}
{// expand query
qexpand.reserve( 256 ); qexpand.clear();
for( int i = 0; i < tree.param.num_roots; ++ i ){
qexpand.push_back( i );
}
}
}
private:
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
std::vector<int> feat_index;
// Instance Data: current node position in the tree of each instance
std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction
std::vector< std::vector<ThreadEntry> > stemp;
private:
const std::vector<float> &grad;
const std::vector<float> &hess;
const FMatrix &smat;
const std::vector<unsigned> &root_index;
const utils::FeatConstrain &constrain;
};
};
};
#endif

Some files were not shown because too many files have changed in this diff Show More